Initial import of ReLAPACKtags/v0.2.20^2
@@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1) | |||
SUBDIRS += lapack | |||
endif | |||
RELA = | |||
ifneq ($(BUILD_RELAPACK), 0) | |||
RELA = re_lapack | |||
endif | |||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | |||
.PHONY : all libs netlib test ctest shared install | |||
.NOTPARALLEL : all libs prof lapack-test install blas-test | |||
.PHONY : all libs netlib $(RELA) test ctest shared install | |||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test | |||
all :: libs netlib tests shared | |||
all :: libs netlib $(RELA) tests shared | |||
@echo | |||
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" | |||
@echo | |||
@@ -215,6 +220,14 @@ ifndef NO_LAPACKE | |||
endif | |||
endif | |||
ifeq ($(NO_LAPACK), 1) | |||
re_lapack : | |||
else | |||
re_lapack : | |||
@$(MAKE) -C relapack | |||
endif | |||
prof_lapack : lapack_prebuild | |||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||
@@ -326,11 +339,7 @@ endif | |||
@touch $(NETLIB_LAPACK_DIR)/make.inc | |||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
@$(MAKE) -C relapack clean | |||
@rm -f *.grd Makefile.conf_last config_last.h | |||
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
@echo Done. | |||
# Makefile debugging trick: | |||
# call print-VARIABLE to see the runtime value of any variable | |||
print-%: | |||
@echo '$*=$($*)' |
@@ -83,6 +83,9 @@ VERSION = 0.2.20.dev | |||
# Build LAPACK Deprecated functions since LAPACK 3.6.0 | |||
BUILD_LAPACK_DEPRECATED = 1 | |||
# Build RecursiveLAPACK on top of LAPACK | |||
BUILD_RELAPACK = 1 | |||
# If you want to use legacy threaded Level 3 implementation. | |||
# USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
@@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
NO_WARMUP = 1 | |||
# If you want to disable CPU/Memory affinity on Linux. | |||
NO_AFFINITY = 1 | |||
#NO_AFFINITY = 1 | |||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
# BIGNUMA = 1 | |||
@@ -0,0 +1,22 @@ | |||
The MIT License (MIT) | |||
Copyright (c) 2016 Elmar Peise | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in all | |||
copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. | |||
@@ -0,0 +1,98 @@ | |||
TOPDIR = .. | |||
include $(TOPDIR)/Makefile.system | |||
SRC = $(wildcard src/*.c) | |||
SRC1 = \ | |||
src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ | |||
src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ | |||
src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ | |||
src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c | |||
SRC2 = \ | |||
src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ | |||
src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ | |||
src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ | |||
src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ | |||
src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ | |||
src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ | |||
src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ | |||
src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ | |||
src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c | |||
SRCX = \ | |||
src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ | |||
src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ | |||
src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ | |||
src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ | |||
src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ | |||
src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ | |||
src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ | |||
src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ | |||
src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c | |||
OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) | |||
OBJS2 = $(SRC2:%.c=%.o) | |||
OBJS = $(OBJS1) $(OBJS2) | |||
TEST_SUITS = \ | |||
slauum dlauum clauum zlauum \ | |||
spotrf dpotrf cpotrf zpotrf \ | |||
spbtrf dpbtrf cpbtrf zpbtrf \ | |||
ssygst dsygst chegst zhegst \ | |||
ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ | |||
sgetrf dgetrf cgetrf zgetrf \ | |||
sgbtrf dgbtrf cgbtrf zgbtrf \ | |||
strsyl dtrsyl ctrsyl ztrsyl \ | |||
stgsyl dtgsyl ctgsyl ztgsyl \ | |||
sgemmt dgemmt cgemmt zgemmt | |||
TESTS = $(TEST_SUITS:%=test/%.pass) # dummies | |||
TEST_EXES = $(TEST_SUITS:%=test/%.x) | |||
LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm | |||
.SECONDARY: $(TEST_EXES) | |||
.PHONY: test | |||
# ReLAPACK compilation | |||
libs: $(OBJS) | |||
@echo "Building ReLAPACK library $(LIBNAME)" | |||
$(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) | |||
$(RANLIB) $(TOPDIR)/$(LIBNAME) | |||
%.$(SUFFIX): %.c config.h | |||
$(CC) $(CFLAGS) -c $< -o $@ | |||
%.o: %.c config.h | |||
$(CC) $(CFLAGS) -c $< -o $@ | |||
# ReLAPACK testing | |||
test: $(TEST_EXES) $(TESTS) | |||
@echo "passed all tests" | |||
test/%.pass: test/%.x | |||
@echo -n $*: | |||
@./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) | |||
test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
$(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
# cleaning up | |||
clean: | |||
rm -f $(OBJS) test/util.$(SUFFIX) test/*.x |
@@ -0,0 +1,68 @@ | |||
ReLAPACK | |||
======== | |||
[](https://travis-ci.org/HPAC/ReLAPACK) | |||
[Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK) | |||
ReLAPACK offers a collection of recursive algorithms for many of LAPACK's | |||
compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK | |||
integrates effortlessly into existing application codes. ReLAPACK's routines | |||
not only outperform the reference LAPACK but also improve upon the performance | |||
of tuned implementations, such as OpenBLAS and MKL. | |||
Coverage | |||
-------- | |||
For a detailed list of covered operations and an overview of operations to which | |||
recursion is not efficiently applicable, see [coverage.md](coverage.md). | |||
Installation | |||
------------ | |||
To compile with the default configuration, simply run `make` to create the | |||
library `librelapack.a`. | |||
### Linking with MKL | |||
Note that to link with MKL, you currently need to set the flag | |||
`COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and | |||
`ztrsyl`. For further configuration options see [config.md](config.md). | |||
### Dependencies | |||
ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked | |||
kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized | |||
and machine specific implementations of these libraries, which are commonly | |||
provided by hardware vendors or available as open source (e.g., | |||
[OpenBLAS](http://www.openblas.net/)). | |||
Testing | |||
------- | |||
ReLAPACK's test suite compares its routines numerically with LAPACK's | |||
counterparts. To set up the tests (located int `test/`) you need to specify | |||
link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then | |||
`make test` runs the tests. For details on the performed tests, see | |||
[test/README.md](test/README.md). | |||
Examples | |||
-------- | |||
Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the | |||
covered routines applies directly to ReLAPACK. A few separate examples are | |||
given in `examples/`. For details, see [examples/README.md](examples/README.md). | |||
Citing | |||
------ | |||
When referencing ReLAPACK, please cite the preprint of the paper | |||
[Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763): | |||
@article{relapack, | |||
author = {Elmar Peise and Paolo Bientinesi}, | |||
title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection}, | |||
journal = {CoRR}, | |||
volume = {abs/1602.06763}, | |||
year = {2016}, | |||
url = {http://arxiv.org/abs/1602.06763}, | |||
} |
@@ -0,0 +1,208 @@ | |||
#ifndef RELAPACK_CONFIG_H | |||
#define RELAPACK_CONFIG_H | |||
// ReLAPACK configuration file. | |||
// See also config.md | |||
/////////////////////////////// | |||
// BLAS/LAPACK obect symbols // | |||
/////////////////////////////// | |||
// BLAS routines linked against have a trailing underscore | |||
#define BLAS_UNDERSCORE 1 | |||
// LAPACK routines linked against have a trailing underscore | |||
#define LAPACK_UNDERSCORE BLAS_UNDERSCORE | |||
// Complex BLAS/LAPACK routines return their result in the first argument | |||
// This option must be enabled when linking to MKL for ctrsyl and ztrsyl to | |||
// work. | |||
#define COMPLEX_FUNCTIONS_AS_ROUTINES 0 | |||
#ifdef F_INTERFACE_INTEL | |||
#define COMPLEX_FUNCTIONS_AS_ROUTINES 1 | |||
#endif | |||
#define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES | |||
#define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES | |||
// The BLAS-like extension xgemmt is provided by an external library. | |||
#define HAVE_XGEMMT 0 | |||
//////////////////////////// | |||
// Use malloc in ReLAPACK // | |||
//////////////////////////// | |||
#define ALLOW_MALLOC 1 | |||
// allow malloc in xsygst for improved performance | |||
#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC | |||
// allow malloc in xsytrf if the passed work buffer is too small | |||
#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC | |||
//////////////////////////////// | |||
// LAPACK routine replacement // | |||
//////////////////////////////// | |||
// The following macros specify which routines are included in the library under | |||
// LAPACK's symbol names: 1 included, 0 not included | |||
#define INCLUDE_ALL 1 | |||
#define INCLUDE_XLAUUM INCLUDE_ALL | |||
#define INCLUDE_SLAUUM INCLUDE_XLAUUM | |||
#define INCLUDE_DLAUUM INCLUDE_XLAUUM | |||
#define INCLUDE_CLAUUM INCLUDE_XLAUUM | |||
#define INCLUDE_ZLAUUM INCLUDE_XLAUUM | |||
#define INCLUDE_XSYGST INCLUDE_ALL | |||
#define INCLUDE_SSYGST INCLUDE_XSYGST | |||
#define INCLUDE_DSYGST INCLUDE_XSYGST | |||
#define INCLUDE_CHEGST INCLUDE_XSYGST | |||
#define INCLUDE_ZHEGST INCLUDE_XSYGST | |||
#define INCLUDE_XTRTRI INCLUDE_ALL | |||
#define INCLUDE_STRTRI INCLUDE_XTRTRI | |||
#define INCLUDE_DTRTRI INCLUDE_XTRTRI | |||
#define INCLUDE_CTRTRI INCLUDE_XTRTRI | |||
#define INCLUDE_ZTRTRI INCLUDE_XTRTRI | |||
#define INCLUDE_XPOTRF INCLUDE_ALL | |||
#define INCLUDE_SPOTRF INCLUDE_XPOTRF | |||
#define INCLUDE_DPOTRF INCLUDE_XPOTRF | |||
#define INCLUDE_CPOTRF INCLUDE_XPOTRF | |||
#define INCLUDE_ZPOTRF INCLUDE_XPOTRF | |||
#define INCLUDE_XPBTRF INCLUDE_ALL | |||
#define INCLUDE_SPBTRF INCLUDE_XPBTRF | |||
#define INCLUDE_DPBTRF INCLUDE_XPBTRF | |||
#define INCLUDE_CPBTRF INCLUDE_XPBTRF | |||
#define INCLUDE_ZPBTRF INCLUDE_XPBTRF | |||
#define INCLUDE_XSYTRF INCLUDE_ALL | |||
#define INCLUDE_SSYTRF INCLUDE_XSYTRF | |||
#define INCLUDE_DSYTRF INCLUDE_XSYTRF | |||
#define INCLUDE_CSYTRF INCLUDE_XSYTRF | |||
#define INCLUDE_CHETRF INCLUDE_XSYTRF | |||
#define INCLUDE_ZSYTRF INCLUDE_XSYTRF | |||
#define INCLUDE_ZHETRF INCLUDE_XSYTRF | |||
#define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF | |||
#define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF | |||
#define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF | |||
#define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF | |||
#define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF | |||
#define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF | |||
#define INCLUDE_XGETRF INCLUDE_ALL | |||
#define INCLUDE_SGETRF INCLUDE_XGETRF | |||
#define INCLUDE_DGETRF INCLUDE_XGETRF | |||
#define INCLUDE_CGETRF INCLUDE_XGETRF | |||
#define INCLUDE_ZGETRF INCLUDE_XGETRF | |||
#define INCLUDE_XGBTRF INCLUDE_ALL | |||
#define INCLUDE_SGBTRF INCLUDE_XGBTRF | |||
#define INCLUDE_DGBTRF INCLUDE_XGBTRF | |||
#define INCLUDE_CGBTRF INCLUDE_XGBTRF | |||
#define INCLUDE_ZGBTRF INCLUDE_XGBTRF | |||
#define INCLUDE_XTRSYL INCLUDE_ALL | |||
#define INCLUDE_STRSYL INCLUDE_XTRSYL | |||
#define INCLUDE_DTRSYL INCLUDE_XTRSYL | |||
#define INCLUDE_CTRSYL INCLUDE_XTRSYL | |||
#define INCLUDE_ZTRSYL INCLUDE_XTRSYL | |||
#define INCLUDE_XTGSYL INCLUDE_ALL | |||
#define INCLUDE_STGSYL INCLUDE_XTGSYL | |||
#define INCLUDE_DTGSYL INCLUDE_XTGSYL | |||
#define INCLUDE_CTGSYL INCLUDE_XTGSYL | |||
#define INCLUDE_ZTGSYL INCLUDE_XTGSYL | |||
#define INCLUDE_XGEMMT 0 | |||
#define INCLUDE_SGEMMT INCLUDE_XGEMMT | |||
#define INCLUDE_DGEMMT INCLUDE_XGEMMT | |||
#define INCLUDE_CGEMMT INCLUDE_XGEMMT | |||
#define INCLUDE_ZGEMMT INCLUDE_XGEMMT | |||
///////////////////// | |||
// crossover sizes // | |||
///////////////////// | |||
// default crossover size | |||
#define CROSSOVER 24 | |||
// individual crossover sizes | |||
#define CROSSOVER_XLAUUM CROSSOVER | |||
#define CROSSOVER_SLAUUM CROSSOVER_XLAUUM | |||
#define CROSSOVER_DLAUUM CROSSOVER_XLAUUM | |||
#define CROSSOVER_CLAUUM CROSSOVER_XLAUUM | |||
#define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM | |||
#define CROSSOVER_XSYGST CROSSOVER | |||
#define CROSSOVER_SSYGST CROSSOVER_XSYGST | |||
#define CROSSOVER_DSYGST CROSSOVER_XSYGST | |||
#define CROSSOVER_CHEGST CROSSOVER_XSYGST | |||
#define CROSSOVER_ZHEGST CROSSOVER_XSYGST | |||
#define CROSSOVER_XTRTRI CROSSOVER | |||
#define CROSSOVER_STRTRI CROSSOVER_XTRTRI | |||
#define CROSSOVER_DTRTRI CROSSOVER_XTRTRI | |||
#define CROSSOVER_CTRTRI CROSSOVER_XTRTRI | |||
#define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI | |||
#define CROSSOVER_XPOTRF CROSSOVER | |||
#define CROSSOVER_SPOTRF CROSSOVER_XPOTRF | |||
#define CROSSOVER_DPOTRF CROSSOVER_XPOTRF | |||
#define CROSSOVER_CPOTRF CROSSOVER_XPOTRF | |||
#define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF | |||
#define CROSSOVER_XPBTRF CROSSOVER | |||
#define CROSSOVER_SPBTRF CROSSOVER_XPBTRF | |||
#define CROSSOVER_DPBTRF CROSSOVER_XPBTRF | |||
#define CROSSOVER_CPBTRF CROSSOVER_XPBTRF | |||
#define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF | |||
#define CROSSOVER_XSYTRF CROSSOVER | |||
#define CROSSOVER_SSYTRF CROSSOVER_XSYTRF | |||
#define CROSSOVER_DSYTRF CROSSOVER_XSYTRF | |||
#define CROSSOVER_CSYTRF CROSSOVER_XSYTRF | |||
#define CROSSOVER_CHETRF CROSSOVER_XSYTRF | |||
#define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF | |||
#define CROSSOVER_ZHETRF CROSSOVER_XSYTRF | |||
#define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF | |||
#define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF | |||
#define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF | |||
#define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF | |||
#define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF | |||
#define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF | |||
#define CROSSOVER_XGETRF CROSSOVER | |||
#define CROSSOVER_SGETRF CROSSOVER_XGETRF | |||
#define CROSSOVER_DGETRF CROSSOVER_XGETRF | |||
#define CROSSOVER_CGETRF CROSSOVER_XGETRF | |||
#define CROSSOVER_ZGETRF CROSSOVER_XGETRF | |||
#define CROSSOVER_XGBTRF CROSSOVER | |||
#define CROSSOVER_SGBTRF CROSSOVER_XGBTRF | |||
#define CROSSOVER_DGBTRF CROSSOVER_XGBTRF | |||
#define CROSSOVER_CGBTRF CROSSOVER_XGBTRF | |||
#define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF | |||
#define CROSSOVER_XTRSYL CROSSOVER | |||
#define CROSSOVER_STRSYL CROSSOVER_XTRSYL | |||
#define CROSSOVER_DTRSYL CROSSOVER_XTRSYL | |||
#define CROSSOVER_CTRSYL CROSSOVER_XTRSYL | |||
#define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL | |||
#define CROSSOVER_XTGSYL CROSSOVER | |||
#define CROSSOVER_STGSYL CROSSOVER_XTGSYL | |||
#define CROSSOVER_DTGSYL CROSSOVER_XTGSYL | |||
#define CROSSOVER_CTGSYL CROSSOVER_XTGSYL | |||
#define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL | |||
// sytrf helper routine | |||
#define CROSSOVER_XGEMMT CROSSOVER_XSYTRF | |||
#define CROSSOVER_SGEMMT CROSSOVER_XGEMMT | |||
#define CROSSOVER_DGEMMT CROSSOVER_XGEMMT | |||
#define CROSSOVER_CGEMMT CROSSOVER_XGEMMT | |||
#define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT | |||
#endif /* RELAPACK_CONFIG_H */ |
@@ -0,0 +1,87 @@ | |||
RELAPACK Configuration | |||
====================== | |||
ReLAPACK has two configuration files: `make.inc`, which is included by the | |||
Makefile, and `config.h` which is included in the source files. | |||
Build and Testing Environment | |||
----------------------------- | |||
The build environment (compiler and flags) and the test configuration (linker | |||
flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size | |||
and error bounds are defined in `test/config.h`. | |||
The library `librelapack.a` is compiled by invoking `make`. The tests are | |||
performed by either `make test` or calling `make` in the test folder. | |||
BLAS/LAPACK complex function interfaces | |||
--------------------------------------- | |||
For BLAS and LAPACK functions that return a complex number, there exist two | |||
conflicting (FORTRAN compiler dependent) calling conventions: either the result | |||
is returned as a `struct` of two floating point numbers or an additional first | |||
argument with a pointer to such a `struct` is used. By default ReLAPACK uses | |||
the former (which is what gfortran uses), but it can switch to the latter by | |||
setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK | |||
specific counterparts) to `1` in `config.h`. | |||
**For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.** | |||
(Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases | |||
will segfault or return errors on the order of 1 or larger.) | |||
BLAS extension `xgemmt` | |||
----------------------- | |||
The LDL decompositions require a general matrix-matrix product that updates only | |||
a triangular matrix called `xgemmt`. If the BLAS implementation linked against | |||
provides such a routine, set the flag `HAVE_XGEMMT` to `1` in `config.h`; | |||
otherwise, ReLAPACK uses its own recursive implementation of these kernels. | |||
`xgemmt` is provided by MKL. | |||
Routine Selection | |||
----------------- | |||
ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the | |||
corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to | |||
`1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g., | |||
`dgetrf_`). By default, wrappers for all routines are enabled. | |||
Crossover Size | |||
-------------- | |||
The crossover size determines below which matrix sizes ReLAPACK's recursive | |||
algorithms switch to LAPACK's unblocked routines to avoid tiny BLAS Level 3 | |||
routines. The crossover size is set in `config.h` and can be chosen either | |||
globally for the entire library, by operation, or individually by routine. | |||
Allowing Temporary Buffers | |||
-------------------------- | |||
Two of ReLAPACK's routines make use of temporary buffers, which are allocated | |||
and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine | |||
specific counterparts) to 0 in `config.h` will disable these buffers. The | |||
affected routines are: | |||
* `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in | |||
LAPACK, this size can be queried by setting `lWork = -1` and the passed | |||
buffer will be used if it is large enough; only if it is not, a local buffer | |||
will be allocated. | |||
The advantage of this mechanism is that ReLAPACK will seamlessly work even | |||
with codes that statically provide too little memory instead of breaking | |||
them. | |||
* `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem | |||
to standard form can use an auxiliary buffer of size n^2 / 2 to avoid | |||
redundant computations. It thereby performs about 30% less FLOPs than | |||
LAPACK. | |||
FORTRAN symbol names | |||
-------------------- | |||
ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN interfaces. | |||
Since these libraries usually have an underscore to their symbol names, ReLAPACK | |||
has configuration switches in `config.h` to adjust the corresponding routine | |||
names. |
@@ -0,0 +1,212 @@ | |||
Coverage of ReLAPACK | |||
==================== | |||
This file lists all LAPACK compute routines that are covered by recursive | |||
algorithms in ReLAPACK, it also lists all of LAPACK's blocked algorithms which | |||
are not (yet) part of ReLAPACK. | |||
<!-- START doctoc generated TOC please keep comment here to allow auto update --> | |||
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE --> | |||
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* | |||
- [List of covered LAPACK routines](#list-of-covered-lapack-routines) | |||
- [`xlauum`](#xlauum) | |||
- [`xsygst`](#xsygst) | |||
- [`xtrtri`](#xtrtri) | |||
- [`xpotrf`](#xpotrf) | |||
- [`xpbtrf`](#xpbtrf) | |||
- [`xsytrf`](#xsytrf) | |||
- [`xgetrf`](#xgetrf) | |||
- [`xgbtrf`](#xgbtrf) | |||
- [`xtrsyl`](#xtrsyl) | |||
- [`xtgsyl`](#xtgsyl) | |||
- [Covered BLAS extension](#covered-blas-extension) | |||
- [`xgemmt`](#xgemmt) | |||
- [Not covered yet](#not-covered-yet) | |||
- [`xpstrf`](#xpstrf) | |||
- [Not covered: extra FLOPs](#not-covered-extra-flops) | |||
- [QR decomposition (and related)](#qr-decomposition-and-related) | |||
- [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal) | |||
- [Symmetric reduction to bidiagonal](#symmetric-reduction-to-bidiagonal) | |||
- [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg) | |||
<!-- END doctoc generated TOC please keep comment here to allow auto update --> | |||
List of covered LAPACK routines | |||
------------------------------- | |||
### `xlauum` | |||
Multiplication of a triangular matrix with its (complex conjugate) transpose, | |||
resulting in a symmetric (Hermitian) matrix. | |||
Routines: `slauum`, `dlauum`, `clauum`, `zlauum` | |||
Operations: | |||
* A = L^T L | |||
* A = U U^T | |||
### `xsygst` | |||
Simultaneous two-sided multiplication of a symmetric matrix with a triangular | |||
matrix and its transpose | |||
Routines: `ssygst`, `dsygst`, `chegst`, `zhegst` | |||
Operations: | |||
* A = inv(L) A inv(L^T) | |||
* A = inv(U^T) A inv(U) | |||
* A = L^T A L | |||
* A = U A U^T | |||
### `xtrtri` | |||
Inversion of a triangular matrix | |||
Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri` | |||
Operations: | |||
* L = inv(L) | |||
* U = inv(U) | |||
### `xpotrf` | |||
Cholesky decomposition of a symmetric (Hermitian) positive definite matrix | |||
Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf` | |||
Operations: | |||
* L L^T = A | |||
* U^T U = A | |||
### `xpbtrf` | |||
Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix | |||
Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf` | |||
Operations: | |||
* L L^T = A | |||
* U^T U = A | |||
### `xsytrf` | |||
LDL decomposition of a symmetric (or Hermitian) matrix | |||
Routines: | |||
* `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`, | |||
* `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`, | |||
`zhetrf_rook` | |||
Operations: | |||
* L D L^T = A | |||
* U^T D U = A | |||
### `xgetrf` | |||
LU decomposition of a general matrix with pivoting | |||
Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf` | |||
Operation: P L U = A | |||
### `xgbtrf` | |||
LU decomposition of a general banded matrix with pivoting | |||
Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf` | |||
Operation: L U = A | |||
### `xtrsyl` | |||
Solution of the quasi-triangular Sylvester equation | |||
Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl` | |||
Operations: | |||
* A X + B Y = C -> X | |||
* A^T X + B Y = C -> X | |||
* A X + B^T Y = C -> X | |||
* A^T X + B^T Y = C -> X | |||
* A X - B Y = C -> X | |||
* A^T X - B Y = C -> X | |||
* A X - B^T Y = C -> X | |||
* A^T X - B^T Y = C -> X | |||
### `xtgsyl` | |||
Solution of the generalized Sylvester equations | |||
Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl` | |||
Operations: | |||
* A R - L B = C, D R - L E = F -> L, R | |||
* A^T R + D^T L = C, R B^T - L E^T = -F -> L, R | |||
Covered BLAS extension | |||
---------------------- | |||
### `xgemmt` | |||
Matrix-matrix product updating only a triangular part of the result | |||
Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt` | |||
Operations: | |||
* C = alpha A B + beta C | |||
* C = alpha A B^T + beta C | |||
* C = alpha A^T B + beta C | |||
* C = alpha A^T B^T + beta C | |||
Not covered yet | |||
--------------- | |||
The following operation is implemented as a blocked algorithm in LAPACK but | |||
currently not yet covered in ReLAPACK as a recursive algorithm | |||
### `xpstrf` | |||
Cholesky decomposition of a positive semi-definite matrix with complete pivoting. | |||
Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf` | |||
Operations: | |||
* P L L^T P^T = A | |||
* P U^T U P^T = A | |||
Not covered: extra FLOPs | |||
------------------------ | |||
The following routines are not covered because recursive variants would require | |||
considerably more FLOPs or operate on banded matrices. | |||
### QR decomposition (and related) | |||
Routines: | |||
* `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf` | |||
* `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf` | |||
* `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf` | |||
* `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf` | |||
* `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf` | |||
Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A | |||
Routines for multiplication with Q: | |||
* `sormqr`, `dormqr`, `cunmqr`, `zunmqr` | |||
* `sormrq`, `dormrq`, `cunmrq`, `zunmrq` | |||
* `sormql`, `dormql`, `cunmql`, `zunmql` | |||
* `sormlq`, `dormlq`, `cunmlq`, `zunmlq` | |||
* `sormrz`, `dormrz`, `cunmrz`, `zunmrz` | |||
Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T | |||
Routines for construction of Q: | |||
* `sorgqr`, `dorgqr`, `cungqr`, `zungqr` | |||
* `sorgrq`, `dorgrq`, `cungrq`, `zungrq` | |||
* `sorgql`, `dorgql`, `cungql`, `zungql` | |||
* `sorglq`, `dorglq`, `cunglq`, `zunglq` | |||
### Symmetric reduction to tridiagonal | |||
Routines: `ssytrd`, `dsytrd`, `csytrd`, `zsytrd` | |||
Operation: Q T Q^T = A | |||
### Symmetric reduction to bidiagonal | |||
Routines: `ssybrd`, `dsybrd`, `csybrd`, `zsybrd` | |||
Operation: Q T P^T = A | |||
### Reduction to upper Hessenberg | |||
Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd` | |||
Operation: Q H Q^T = A |
@@ -0,0 +1,67 @@ | |||
#ifndef RELAPACK_H | |||
#define RELAPACK_H | |||
void RELAPACK_slauum(const char *, const int *, float *, const int *, int *); | |||
void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *); | |||
void RELAPACK_clauum(const char *, const int *, float *, const int *, int *); | |||
void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *); | |||
void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *); | |||
void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *); | |||
void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *); | |||
void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *); | |||
void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *); | |||
void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *); | |||
void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *); | |||
void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *); | |||
void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *); | |||
void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *); | |||
void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *); | |||
void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); | |||
void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); | |||
void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); | |||
void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); | |||
void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); | |||
void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); | |||
void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); | |||
void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); | |||
void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); | |||
void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
#endif /* RELAPACK_H */ |
@@ -0,0 +1,61 @@ | |||
#ifndef BLAS_H | |||
#define BLAS_H | |||
extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); | |||
extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); | |||
extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); | |||
extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); | |||
extern void BLAS(sscal)(const int *, const float *, float *, const int *); | |||
extern void BLAS(dscal)(const int *, const double *, double *, const int *); | |||
extern void BLAS(cscal)(const int *, const float *, float *, const int *); | |||
extern void BLAS(zscal)(const int *, const double *, double *, const int *); | |||
extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); | |||
extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); | |||
extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); | |||
extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); | |||
extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); | |||
extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); | |||
extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); | |||
extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); | |||
extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
#if HAVE_XGEMMT | |||
extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
#endif | |||
#endif /* BLAS_H */ |
@@ -0,0 +1,230 @@ | |||
#include "relapack.h" | |||
#include "stdlib.h" | |||
static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, | |||
const int *, float *, const int *, int *, float *, const int *, float *, | |||
const int *, int *); | |||
/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. | |||
* | |||
* This routine is functionally equivalent to LAPACK's cgbtrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html | |||
* */ | |||
void RELAPACK_cgbtrf( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
float *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
// Check arguments | |||
*info = 0; | |||
if (*m < 0) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*kl < 0) | |||
*info = -3; | |||
else if (*ku < 0) | |||
*info = -4; | |||
else if (*ldAb < 2 * *kl + *ku + 1) | |||
*info = -6; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CGBTRF", &minfo); | |||
return; | |||
} | |||
// Constant | |||
const float ZERO[] = { 0., 0. }; | |||
// Result upper band width | |||
const int kv = *ku + *kl; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
float *const A = Ab + 2 * kv; | |||
// Zero upper diagonal fill-in elements | |||
int i, j; | |||
for (j = 0; j < *n; j++) { | |||
float *const A_j = A + 2 * *ldA * j; | |||
for (i = MAX(0, j - kv); i < j - *ku; i++) | |||
A_j[2 * i] = A_j[2 * i + 1] = 0.; | |||
} | |||
// Allocate work space | |||
const int n1 = CREC_SPLIT(*n); | |||
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
const int nWorkl = (kv > n1) ? n1 : kv; | |||
const int mWorku = (*kl > n1) ? n1 : *kl; | |||
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); | |||
float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); | |||
LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
// Recursive kernel | |||
RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
// Free work space | |||
free(Workl); | |||
free(Worku); | |||
} | |||
/** cgbtrf's recursive compute kernel */ | |||
static void RELAPACK_cgbtrf_rec( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
float *Ab, const int *ldAb, int *ipiv, | |||
float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { | |||
// Unblocked | |||
LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterators | |||
int i, j; | |||
// Output upper band width | |||
const int kv = *ku + *kl; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
float *const A = Ab + 2 * kv; | |||
// Splitting | |||
const int n1 = MIN(CREC_SPLIT(*n), *kl); | |||
const int n2 = *n - n1; | |||
const int m1 = MIN(n1, *m); | |||
const int m2 = *m - m1; | |||
const int mn1 = MIN(m1, n1); | |||
const int mn2 = MIN(m2, n2); | |||
// Ab_L * | |||
// Ab_BR | |||
float *const Ab_L = Ab; | |||
float *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
// A_L A_R | |||
float *const A_L = A; | |||
float *const A_R = A + 2 * *ldA * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + 2 * *ldA * n1; | |||
float *const A_BL = A + 2 * m1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * m1; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_T = ipiv; | |||
int *const ipiv_B = ipiv + n1; | |||
// Banded splitting | |||
const int n21 = MIN(n2, kv - n1); | |||
const int n22 = MIN(n2 - n21, n1); | |||
const int m21 = MIN(m2, *kl - m1); | |||
const int m22 = MIN(m2 - m21, m1); | |||
// n1 n21 n22 | |||
// m * A_Rl ARr | |||
float *const A_Rl = A_R; | |||
float *const A_Rr = A_R + 2 * *ldA * n21; | |||
// n1 n21 n22 | |||
// m1 * A_TRl A_TRr | |||
// m21 A_BLt A_BRtl A_BRtr | |||
// m22 A_BLb A_BRbl A_BRbr | |||
float *const A_TRl = A_TR; | |||
float *const A_TRr = A_TR + 2 * *ldA * n21; | |||
float *const A_BLt = A_BL; | |||
float *const A_BLb = A_BL + 2 * m21; | |||
float *const A_BRtl = A_BR; | |||
float *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
float *const A_BRbl = A_BR + 2 * m21; | |||
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; | |||
// recursion(Ab_L, ipiv_T) | |||
RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
// Workl = A_BLb | |||
LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
// partially redo swaps in A_L | |||
for (i = 0; i < mn1; i++) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
if (ip < *kl) | |||
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
else | |||
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
} | |||
} | |||
// apply pivots to A_Rl | |||
LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
// apply pivots to A_Rr columnwise | |||
for (j = 0; j < n22; j++) { | |||
float *const A_Rrj = A_Rr + 2 * *ldA * j; | |||
for (i = j; i < mn1; i++) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
const float tmpr = A_Rrj[2 * i]; | |||
const float tmpc = A_Rrj[2 * i + 1]; | |||
A_Rrj[2 * i] = A_Rrj[2 * ip]; | |||
A_Rrj[2 * i + 1] = A_Rr[2 * ip + 1]; | |||
A_Rrj[2 * ip] = tmpr; | |||
A_Rrj[2 * ip + 1] = tmpc; | |||
} | |||
} | |||
} | |||
// A_TRl = A_TL \ A_TRl | |||
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
// Worku = A_TRr | |||
LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
// Worku = A_TL \ Worku | |||
BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
// A_TRr = Worku | |||
LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
// A_BRtl = A_BRtl - A_BLt * A_TRl | |||
BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
// A_BRbl = A_BRbl - Workl * A_TRl | |||
BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
// A_BRtr = A_BRtr - A_BLt * Worku | |||
BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
// A_BRbr = A_BRbr - Workl * Worku | |||
BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
// partially undo swaps in A_L | |||
for (i = mn1 - 1; i >= 0; i--) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
if (ip < *kl) | |||
BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
else | |||
BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
} | |||
} | |||
// recursion(Ab_BR, ipiv_B) | |||
RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
if (*info) | |||
*info += n1; | |||
// shift pivots | |||
for (i = 0; i < mn2; i++) | |||
ipiv_B[i] += n1; | |||
} |
@@ -0,0 +1,167 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_cgemmt_rec(const char *, const char *, const char *, | |||
const int *, const int *, const float *, const float *, const int *, | |||
const float *, const int *, const float *, float *, const int *); | |||
static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, | |||
const int *, const int *, const float *, const float *, const int *, | |||
const float *, const int *, const float *, float *, const int *); | |||
/** CGEMMT computes a matrix-matrix product with general matrices but updates | |||
* only the upper or lower triangular part of the result matrix. | |||
* | |||
* This routine performs the same operation as the BLAS routine | |||
* cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
* but only updates the triangular part of C specified by uplo: | |||
* If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
* otherwise the upper triangular part is updated. | |||
* */ | |||
void RELAPACK_cgemmt( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
#if HAVE_XGEMMT | |||
BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
return; | |||
#else | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
const int notransA = LAPACK(lsame)(transA, "N"); | |||
const int tranA = LAPACK(lsame)(transA, "T"); | |||
const int ctransA = LAPACK(lsame)(transA, "C"); | |||
const int notransB = LAPACK(lsame)(transB, "N"); | |||
const int tranB = LAPACK(lsame)(transB, "T"); | |||
const int ctransB = LAPACK(lsame)(transB, "C"); | |||
int info = 0; | |||
if (!lower && !upper) | |||
info = 1; | |||
else if (!tranA && !ctransA && !notransA) | |||
info = 2; | |||
else if (!tranB && !ctransB && !notransB) | |||
info = 3; | |||
else if (*n < 0) | |||
info = 4; | |||
else if (*k < 0) | |||
info = 5; | |||
else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
info = 8; | |||
else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
info = 10; | |||
else if (*ldC < MAX(1, *n)) | |||
info = 13; | |||
if (info) { | |||
LAPACK(xerbla)("CGEMMT", &info); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); | |||
const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); | |||
// Recursive kernel | |||
RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
#endif | |||
} | |||
/** cgemmt's recursive compute kernel */ | |||
static void RELAPACK_cgemmt_rec( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { | |||
// Unblocked | |||
RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
return; | |||
} | |||
// Splitting | |||
const int n1 = CREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_T | |||
// A_B | |||
const float *const A_T = A; | |||
const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); | |||
// B_L B_R | |||
const float *const B_L = B; | |||
const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); | |||
// C_TL C_TR | |||
// C_BL C_BR | |||
float *const C_TL = C; | |||
float *const C_TR = C + 2 * *ldC * n1; | |||
float *const C_BL = C + 2 * n1; | |||
float *const C_BR = C + 2 * *ldC * n1 + 2 * n1; | |||
// recursion(C_TL) | |||
RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
if (*uplo == 'L') | |||
// C_BL = alpha A_B B_L + beta C_BL | |||
BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
else | |||
// C_TR = alpha A_T B_R + beta C_TR | |||
BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
// recursion(C_BR) | |||
RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
} | |||
/** cgemmt's unblocked compute kernel */ | |||
static void RELAPACK_cgemmt_rec2( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
const int incB = (*transB == 'N') ? 1 : *ldB; | |||
const int incC = 1; | |||
int i; | |||
for (i = 0; i < *n; i++) { | |||
// A_0 | |||
// A_i | |||
const float *const A_0 = A; | |||
const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i); | |||
// * B_i * | |||
const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i); | |||
// * C_0i * | |||
// * C_ii * | |||
float *const C_0i = C + 2 * *ldC * i; | |||
float *const C_ii = C + 2 * *ldC * i + 2 * i; | |||
if (*uplo == 'L') { | |||
const int nmi = *n - i; | |||
if (*transA == 'N') | |||
BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
else | |||
BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
} else { | |||
const int ip1 = i + 1; | |||
if (*transA == 'N') | |||
BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
else | |||
BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
} | |||
} | |||
} |
@@ -0,0 +1,117 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_cgetrf_rec(const int *, const int *, float *, | |||
const int *, int *, int *); | |||
/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
* | |||
* This routine is functionally equivalent to LAPACK's cgetrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html | |||
*/ | |||
void RELAPACK_cgetrf( | |||
const int *m, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
// Check arguments | |||
*info = 0; | |||
if (*m < 0) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CGETRF", &minfo); | |||
return; | |||
} | |||
const int sn = MIN(*m, *n); | |||
RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
// Right remainder | |||
if (*m < *n) { | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Splitting | |||
const int rn = *n - *m; | |||
// A_L A_R | |||
const float *const A_L = A; | |||
float *const A_R = A + 2 * *ldA * *m; | |||
// A_R = apply(ipiv, A_R) | |||
LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
// A_R = A_L \ A_R | |||
BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
} | |||
} | |||
/** cgetrf's recursive compute kernel */ | |||
static void RELAPACK_cgetrf_rec( | |||
const int *m, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_CGETRF, 1)) { | |||
// Unblocked | |||
LAPACK(cgetf2)(m, n, A, ldA, ipiv, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Splitting | |||
const int n1 = CREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
const int m2 = *m - n1; | |||
// A_L A_R | |||
float *const A_L = A; | |||
float *const A_R = A + 2 * *ldA * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + 2 * *ldA * n1; | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_T = ipiv; | |||
int *const ipiv_B = ipiv + n1; | |||
// recursion(A_L, ipiv_T) | |||
RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
// apply pivots to A_R | |||
LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
// A_TR = A_TL \ A_TR | |||
BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
// A_BR = A_BR - A_BL * A_TR | |||
BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
// recursion(A_BR, ipiv_B) | |||
RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
if (*info) | |||
*info += n1; | |||
// apply pivots to A_BL | |||
LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
ipiv_B[i] += n1; | |||
} |
@@ -0,0 +1,212 @@ | |||
#include "relapack.h" | |||
#if XSYGST_ALLOW_MALLOC | |||
#include "stdlib.h" | |||
#endif | |||
static void RELAPACK_chegst_rec(const int *, const char *, const int *, | |||
float *, const int *, const float *, const int *, | |||
float *, const int *, int *); | |||
/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. | |||
* | |||
* This routine is functionally equivalent to LAPACK's chegst. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html | |||
* */ | |||
void RELAPACK_chegst( | |||
const int *itype, const char *uplo, const int *n, | |||
float *A, const int *ldA, const float *B, const int *ldB, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (*itype < 1 || *itype > 3) | |||
*info = -1; | |||
else if (!lower && !upper) | |||
*info = -2; | |||
else if (*n < 0) | |||
*info = -3; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -5; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -7; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CHEGST", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Allocate work space | |||
float *Work = NULL; | |||
int lWork = 0; | |||
#if XSYGST_ALLOW_MALLOC | |||
const int n1 = CREC_SPLIT(*n); | |||
lWork = n1 * (*n - n1); | |||
Work = malloc(lWork * 2 * sizeof(float)); | |||
if (!Work) | |||
lWork = 0; | |||
#endif | |||
// recursive kernel | |||
RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
// Free work space | |||
#if XSYGST_ALLOW_MALLOC | |||
if (Work) | |||
free(Work); | |||
#endif | |||
} | |||
/** chegst's recursive compute kernel */ | |||
static void RELAPACK_chegst_rec( | |||
const int *itype, const char *uplo, const int *n, | |||
float *A, const int *ldA, const float *B, const int *ldB, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_CHEGST, 1)) { | |||
// Unblocked | |||
LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info); | |||
return; | |||
} | |||
// Constants | |||
const float ZERO[] = { 0., 0. }; | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const float HALF[] = { .5, 0. }; | |||
const float MHALF[] = { -.5, 0. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterator | |||
int i; | |||
// Splitting | |||
const int n1 = CREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + 2 * *ldA * n1; | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// B_TL B_TR | |||
// B_BL B_BR | |||
const float *const B_TL = B; | |||
const float *const B_TR = B + 2 * *ldB * n1; | |||
const float *const B_BL = B + 2 * n1; | |||
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
// recursion(A_TL, B_TL) | |||
RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); | |||
if (*itype == 1) | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL / B_TL' | |||
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = -1/2 * B_BL * A_TL | |||
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
} else | |||
// A_BL = A_BL - 1/2 B_BL * A_TL | |||
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' | |||
BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
else | |||
// A_BL = A_BL - 1/2 B_BL * A_TL | |||
BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BL = B_BR \ A_BL | |||
BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
} else { | |||
// A_TR = B_TL' \ A_TR | |||
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = -1/2 * A_TL * B_TR | |||
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
// A_TR = A_BL + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
} else | |||
// A_TR = A_TR - 1/2 A_TL * B_TR | |||
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR | |||
BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_TR = A_BL + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
else | |||
// A_TR = A_TR - 1/2 A_TL * B_TR | |||
BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TR = A_TR / B_BR | |||
BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
} | |||
else | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL * B_TL | |||
BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = 1/2 * A_BR * B_BL | |||
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
} else | |||
// A_BL = A_BL + 1/2 A_BR * B_BL | |||
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL | |||
BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
else | |||
// A_BL = A_BL + 1/2 A_BR * B_BL | |||
BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BL = B_BR * A_BL | |||
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
} else { | |||
// A_TR = B_TL * A_TR | |||
BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = 1/2 * B_TR * A_BR | |||
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
// A_TR = A_TR + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
} else | |||
// A_TR = A_TR + 1/2 B_TR A_BR | |||
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' | |||
BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_TR = A_TR + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
else | |||
// A_TR = A_TR + 1/2 B_TR * A_BR | |||
BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TR = A_TR * B_BR | |||
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
} | |||
// recursion(A_BR, B_BR) | |||
RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); | |||
} |
@@ -0,0 +1,236 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, | |||
float *, const int *, int *, float *, const int *, int *); | |||
/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's chetrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html | |||
* */ | |||
void RELAPACK_chetrf( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
float *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CHETRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy argument | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** chetrf's recursive compute kernel */ | |||
static void RELAPACK_chetrf_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_CHETRF, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = CREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
float *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
float *const A_BL_B = A + 2 * *n; | |||
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
float *const Work_BL = Work + 2 * n1; | |||
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
// last row of A_BL | |||
float *const A_BL_b = A_BL + 2 * n2_out; | |||
// last row of Work_BL | |||
float *const Work_BL_b = Work_BL + 2 * n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = CREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
float *const A_TL_T = A + 2 * *ldA * n_rest; | |||
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
float *const Work_L = Work; | |||
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,520 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static complex c_b1 = {1.f,0.f}; | |||
static int c__1 = 1; | |||
/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method | |||
* | |||
* This routine is a minor modification of LAPACK's clahef. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * | |||
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, | |||
int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
float r__1, r__2, r__3, r__4; | |||
complex q__1, q__2, q__3, q__4; | |||
/* Builtin functions */ | |||
double sqrt(double), r_imag(complex *); | |||
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); | |||
/* Local variables */ | |||
static int j, k; | |||
static float t, r1; | |||
static complex d11, d21, d22; | |||
static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
static float alpha; | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
, complex *, int *, complex *, int *, complex *, complex * | |||
, int *, ftnlen), ccopy_(int *, complex *, int *, | |||
complex *, int *), cswap_(int *, complex *, int *, | |||
complex *, int *); | |||
static int kstep; | |||
static float absakk; | |||
extern /* Subroutine */ int clacgv_(int *, complex *, int *); | |||
extern int icamax_(int *, complex *, int *); | |||
extern /* Subroutine */ int csscal_(int *, float *, complex *, int | |||
*); | |||
static float colmax, rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
kstep = 1; | |||
i__1 = k - 1; | |||
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = k + kw * w_dim1; | |||
i__2 = k + k * a_dim1; | |||
r__1 = a[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
i__1 = k + kw * w_dim1; | |||
i__2 = k + kw * w_dim1; | |||
r__1 = w[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
} | |||
i__1 = k + kw * w_dim1; | |||
absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = imax + kw * w_dim1; | |||
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
+ kw * w_dim1]), dabs(r__2)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * a_dim1; | |||
r__1 = a[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
i__1 = imax - 1; | |||
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
i__2 = imax + imax * a_dim1; | |||
r__1 = a[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
i__1 = k - imax; | |||
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
i__1 = k - imax; | |||
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
i__2 = imax + (kw - 1) * w_dim1; | |||
r__1 = w[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
} | |||
i__1 = k - imax; | |||
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
&c__1); | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
/* Computing MAX */ | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( | |||
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( | |||
r__2)); | |||
rowmax = dmax(r__3,r__4); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else /* if(complicated condition) */ { | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { | |||
kp = imax; | |||
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
r__1 = a[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
i__1 = kk - 1 - kp; | |||
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
i__1 = kk - 1 - kp; | |||
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
if (kp > 1) { | |||
i__1 = kp - 1; | |||
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
+ 1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
+ 1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
if (k > 1) { | |||
i__1 = k + k * a_dim1; | |||
r1 = 1.f / a[i__1].r; | |||
i__1 = k - 1; | |||
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
i__1 = k - 1; | |||
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
} | |||
} else { | |||
if (k > 2) { | |||
i__1 = k - 1 + kw * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
r_cnjg(&q__2, &d21); | |||
c_div(&q__1, &w[k + kw * w_dim1], &q__2); | |||
d11.r = q__1.r, d11.i = q__1.i; | |||
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
d22.r = q__1.r, d22.i = q__1.i; | |||
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
t = 1.f / (q__1.r - 1.f); | |||
q__2.r = t, q__2.i = 0.f; | |||
c_div(&q__1, &q__2, &d21); | |||
d21.r = q__1.r, d21.i = q__1.i; | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
i__2 = j + (k - 1) * a_dim1; | |||
i__3 = j + (kw - 1) * w_dim1; | |||
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + kw * w_dim1; | |||
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
.i; | |||
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
d21.r * q__2.i + d21.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
i__2 = j + k * a_dim1; | |||
r_cnjg(&q__2, &d21); | |||
i__3 = j + kw * w_dim1; | |||
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + (kw - 1) * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = | |||
q__2.r * q__3.i + q__2.i * q__3.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L20: */ | |||
} | |||
} | |||
i__1 = k - 1 + (k - 1) * a_dim1; | |||
i__2 = k - 1 + (kw - 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1 + k * a_dim1; | |||
i__2 = k - 1 + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1; | |||
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = k - 2; | |||
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
++j; | |||
} | |||
++j; | |||
if (jp != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
} | |||
if (j <= *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
kstep = 1; | |||
i__1 = k + k * w_dim1; | |||
i__2 = k + k * a_dim1; | |||
r__1 = a[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
w_dim1], &c__1); | |||
} | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k | |||
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
i__1 = k + k * w_dim1; | |||
i__2 = k + k * w_dim1; | |||
r__1 = w[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
i__1 = k + k * w_dim1; | |||
absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = imax + k * w_dim1; | |||
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
+ k * w_dim1]), dabs(r__2)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * a_dim1; | |||
r__1 = a[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
i__1 = imax - k; | |||
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = imax - k; | |||
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
i__1 = imax + (k + 1) * w_dim1; | |||
i__2 = imax + imax * a_dim1; | |||
r__1 = a[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
imax + 1 + (k + 1) * w_dim1], &c__1); | |||
} | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], | |||
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * | |||
w_dim1], &c__1, (ftnlen)12); | |||
i__1 = imax + (k + 1) * w_dim1; | |||
i__2 = imax + (k + 1) * w_dim1; | |||
r__1 = w[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
i__1 = imax - k; | |||
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
; | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
/* Computing MAX */ | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( | |||
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( | |||
r__2)); | |||
rowmax = dmax(r__3,r__4); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else /* if(complicated condition) */ { | |||
i__1 = imax + (k + 1) * w_dim1; | |||
if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + | |||
k * w_dim1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
r__1 = a[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
i__1 = kp - kk - 1; | |||
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
1) * a_dim1], lda); | |||
i__1 = kp - kk - 1; | |||
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
if (kp < *n) { | |||
i__1 = *n - kp; | |||
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
+ kp * a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
} | |||
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
i__1 = k + k * a_dim1; | |||
r1 = 1.f / a[i__1].r; | |||
i__1 = *n - k; | |||
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
i__1 = *n - k; | |||
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
i__1 = k + 1 + k * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
d11.r = q__1.r, d11.i = q__1.i; | |||
r_cnjg(&q__2, &d21); | |||
c_div(&q__1, &w[k + k * w_dim1], &q__2); | |||
d22.r = q__1.r, d22.i = q__1.i; | |||
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
t = 1.f / (q__1.r - 1.f); | |||
q__2.r = t, q__2.i = 0.f; | |||
c_div(&q__1, &q__2, &d21); | |||
d21.r = q__1.r, d21.i = q__1.i; | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
i__2 = j + k * a_dim1; | |||
r_cnjg(&q__2, &d21); | |||
i__3 = j + k * w_dim1; | |||
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + (k + 1) * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = | |||
q__2.r * q__3.i + q__2.i * q__3.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
i__2 = j + (k + 1) * a_dim1; | |||
i__3 = j + (k + 1) * w_dim1; | |||
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + k * w_dim1; | |||
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
.i; | |||
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
d21.r * q__2.i + d21.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L80: */ | |||
} | |||
} | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + k * a_dim1; | |||
i__2 = k + 1 + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + (k + 1) * a_dim1; | |||
i__2 = k + 1 + (k + 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = *n - k; | |||
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = *n - k - 1; | |||
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
--j; | |||
} | |||
--j; | |||
if (jp != jj && j >= 1) { | |||
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j >= 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,236 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, | |||
float *, const int *, int *, float *, const int *, int *); | |||
/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's chetrf_rook. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html | |||
* */ | |||
void RELAPACK_chetrf_rook( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
float *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CHETRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy argument | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** chetrf_rook's recursive compute kernel */ | |||
static void RELAPACK_chetrf_rook_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_CHETRF, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = CREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
float *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
float *const A_BL_B = A + 2 * *n; | |||
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
float *const Work_BL = Work + 2 * n1; | |||
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
// last row of A_BL | |||
float *const A_BL_b = A_BL + 2 * n2_out; | |||
// last row of Work_BL | |||
float *const Work_BL_b = Work_BL + 2 * n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = CREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
float *const A_TL_T = A + 2 * *ldA * n_rest; | |||
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
float *const Work_L = Work; | |||
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,661 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static complex c_b1 = {1.f,0.f}; | |||
static int c__1 = 1; | |||
/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method | |||
* | |||
* This routine is a minor modification of LAPACK's clahef_rook. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n, | |||
int *nb, int *kb, complex *a, int *lda, int *ipiv, | |||
complex *w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
float r__1, r__2; | |||
complex q__1, q__2, q__3, q__4, q__5; | |||
/* Builtin functions */ | |||
double sqrt(double), r_imag(complex *); | |||
void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); | |||
/* Local variables */ | |||
static int j, k, p; | |||
static float t, r1; | |||
static complex d11, d21, d22; | |||
static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
static logical done; | |||
static int imax, jmax; | |||
static float alpha; | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
, complex *, int *, complex *, int *, complex *, complex * | |||
, int *, ftnlen); | |||
static float sfmin; | |||
extern /* Subroutine */ int ccopy_(int *, complex *, int *, | |||
complex *, int *); | |||
static int itemp; | |||
extern /* Subroutine */ int cswap_(int *, complex *, int *, | |||
complex *, int *); | |||
static int kstep; | |||
static float stemp, absakk; | |||
extern /* Subroutine */ int clacgv_(int *, complex *, int *); | |||
extern int icamax_(int *, complex *, int *); | |||
extern double slamch_(char *, ftnlen); | |||
extern /* Subroutine */ int csscal_(int *, float *, complex *, int | |||
*); | |||
static float colmax, rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
sfmin = slamch_("S", (ftnlen)1); | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
kstep = 1; | |||
p = k; | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & | |||
c__1); | |||
} | |||
i__1 = k + kw * w_dim1; | |||
i__2 = k + k * a_dim1; | |||
r__1 = a[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
i__1 = k + kw * w_dim1; | |||
i__2 = k + kw * w_dim1; | |||
r__1 = w[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
} | |||
i__1 = k + kw * w_dim1; | |||
absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = imax + kw * w_dim1; | |||
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
+ kw * w_dim1]), dabs(r__2)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
r__1 = w[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], | |||
&c__1); | |||
} | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L12: | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
} | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
i__2 = imax + imax * a_dim1; | |||
r__1 = a[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
i__1 = k - imax; | |||
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
i__1 = k - imax; | |||
clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
i__2 = imax + (kw - 1) * w_dim1; | |||
r__1 = w[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
} | |||
if (imax != k) { | |||
i__1 = k - imax; | |||
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
w_dim1], &c__1); | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
} else { | |||
rowmax = 0.f; | |||
} | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
i__1 = itemp + (kw - 1) * w_dim1; | |||
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); | |||
if (stemp > rowmax) { | |||
rowmax = stemp; | |||
jmax = itemp; | |||
} | |||
} | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { | |||
kp = imax; | |||
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} | |||
if (! done) { | |||
goto L12; | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kstep == 2 && p != k) { | |||
i__1 = p + p * a_dim1; | |||
i__2 = k + k * a_dim1; | |||
r__1 = a[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
i__1 = k - 1 - p; | |||
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
a_dim1], lda); | |||
i__1 = k - 1 - p; | |||
clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); | |||
if (p > 1) { | |||
i__1 = p - 1; | |||
ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + | |||
1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + | |||
1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
ldw); | |||
} | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
r__1 = a[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
i__1 = kk - 1 - kp; | |||
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
i__1 = kk - 1 - kp; | |||
clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
if (kp > 1) { | |||
i__1 = kp - 1; | |||
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
+ 1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
+ 1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
if (k > 1) { | |||
i__1 = k + k * a_dim1; | |||
t = a[i__1].r; | |||
if (dabs(t) >= sfmin) { | |||
r1 = 1.f / t; | |||
i__1 = k - 1; | |||
csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
i__1 = k - 1; | |||
for (ii = 1; ii <= i__1; ++ii) { | |||
i__2 = ii + k * a_dim1; | |||
i__3 = ii + k * a_dim1; | |||
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L14: */ | |||
} | |||
} | |||
i__1 = k - 1; | |||
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
} | |||
} else { | |||
if (k > 2) { | |||
i__1 = k - 1 + kw * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
r_cnjg(&q__2, &d21); | |||
c_div(&q__1, &w[k + kw * w_dim1], &q__2); | |||
d11.r = q__1.r, d11.i = q__1.i; | |||
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
d22.r = q__1.r, d22.i = q__1.i; | |||
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
t = 1.f / (q__1.r - 1.f); | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
i__2 = j + (k - 1) * a_dim1; | |||
i__3 = j + (kw - 1) * w_dim1; | |||
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + kw * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
c_div(&q__2, &q__3, &d21); | |||
q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + kw * w_dim1; | |||
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + (kw - 1) * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
r_cnjg(&q__5, &d21); | |||
c_div(&q__2, &q__3, &q__5); | |||
q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L20: */ | |||
} | |||
} | |||
i__1 = k - 1 + (k - 1) * a_dim1; | |||
i__2 = k - 1 + (kw - 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1 + k * a_dim1; | |||
i__2 = k - 1 + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1; | |||
clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = k - 2; | |||
clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
++j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
++j; | |||
if (jp2 != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
++jj; | |||
if (kstep == 2 && jp1 != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
if (j < *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
kstep = 1; | |||
p = k; | |||
i__1 = k + k * w_dim1; | |||
i__2 = k + k * a_dim1; | |||
r__1 = a[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
w_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & | |||
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
ftnlen)12); | |||
i__1 = k + k * w_dim1; | |||
i__2 = k + k * w_dim1; | |||
r__1 = w[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
} | |||
i__1 = k + k * w_dim1; | |||
absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = imax + k * w_dim1; | |||
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
+ k * w_dim1]), dabs(r__2)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
r__1 = w[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * | |||
a_dim1], &c__1); | |||
} | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L72: | |||
i__1 = imax - k; | |||
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = imax - k; | |||
clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
i__1 = imax + (k + 1) * w_dim1; | |||
i__2 = imax + imax * a_dim1; | |||
r__1 = a[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
imax + 1 + (k + 1) * w_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] | |||
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
1) * w_dim1], &c__1, (ftnlen)12); | |||
i__1 = imax + (k + 1) * w_dim1; | |||
i__2 = imax + (k + 1) * w_dim1; | |||
r__1 = w[i__2].r; | |||
w[i__1].r = r__1, w[i__1].i = 0.f; | |||
} | |||
if (imax != k) { | |||
i__1 = imax - k; | |||
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
c__1); | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
w[jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
} else { | |||
rowmax = 0.f; | |||
} | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = itemp + (k + 1) * w_dim1; | |||
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
w[itemp + (k + 1) * w_dim1]), dabs(r__2)); | |||
if (stemp > rowmax) { | |||
rowmax = stemp; | |||
jmax = itemp; | |||
} | |||
} | |||
i__1 = imax + (k + 1) * w_dim1; | |||
if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
} | |||
if (! done) { | |||
goto L72; | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kstep == 2 && p != k) { | |||
i__1 = p + p * a_dim1; | |||
i__2 = k + k * a_dim1; | |||
r__1 = a[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
i__1 = p - k - 1; | |||
ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * | |||
a_dim1], lda); | |||
i__1 = p - k - 1; | |||
clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); | |||
if (p < *n) { | |||
i__1 = *n - p; | |||
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p | |||
* a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
} | |||
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
} | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
r__1 = a[i__2].r; | |||
a[i__1].r = r__1, a[i__1].i = 0.f; | |||
i__1 = kp - kk - 1; | |||
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
1) * a_dim1], lda); | |||
i__1 = kp - kk - 1; | |||
clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
if (kp < *n) { | |||
i__1 = *n - kp; | |||
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
+ kp * a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
} | |||
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
i__1 = k + k * a_dim1; | |||
t = a[i__1].r; | |||
if (dabs(t) >= sfmin) { | |||
r1 = 1.f / t; | |||
i__1 = *n - k; | |||
csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} else { | |||
i__1 = *n; | |||
for (ii = k + 1; ii <= i__1; ++ii) { | |||
i__2 = ii + k * a_dim1; | |||
i__3 = ii + k * a_dim1; | |||
q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L74: */ | |||
} | |||
} | |||
i__1 = *n - k; | |||
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
i__1 = k + 1 + k * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
d11.r = q__1.r, d11.i = q__1.i; | |||
r_cnjg(&q__2, &d21); | |||
c_div(&q__1, &w[k + k * w_dim1], &q__2); | |||
d22.r = q__1.r, d22.i = q__1.i; | |||
q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
t = 1.f / (q__1.r - 1.f); | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + k * w_dim1; | |||
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + (k + 1) * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
r_cnjg(&q__5, &d21); | |||
c_div(&q__2, &q__3, &q__5); | |||
q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
i__2 = j + (k + 1) * a_dim1; | |||
i__3 = j + (k + 1) * w_dim1; | |||
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + k * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
c_div(&q__2, &q__3, &d21); | |||
q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L80: */ | |||
} | |||
} | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + k * a_dim1; | |||
i__2 = k + 1 + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + (k + 1) * a_dim1; | |||
i__2 = k + 1 + (k + 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = *n - k; | |||
clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = *n - k - 1; | |||
clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
--j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
--j; | |||
if (jp2 != jj && j >= 1) { | |||
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
--jj; | |||
if (kstep == 2 && jp1 != jj && j >= 1) { | |||
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j > 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,87 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_clauum_rec(const char *, const int *, float *, | |||
const int *, int *); | |||
/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's clauum. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html | |||
* */ | |||
void RELAPACK_clauum( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CLAUUM", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Recursive kernel | |||
RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info); | |||
} | |||
/** clauum's recursive compute kernel */ | |||
static void RELAPACK_clauum_rec( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { | |||
// Unblocked | |||
LAPACK(clauu2)(uplo, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
// Splitting | |||
const int n1 = CREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + 2 * *ldA * n1; | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// recursion(A_TL) | |||
RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info); | |||
if (*uplo == 'L') { | |||
// A_TL = A_TL + A_BL' * A_BL | |||
BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
// A_BL = A_BR' * A_BL | |||
BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
} else { | |||
// A_TL = A_TL + A_TR * A_TR' | |||
BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
// A_TR = A_TR * A_BR' | |||
BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info); | |||
} |
@@ -0,0 +1,157 @@ | |||
#include "relapack.h" | |||
#include "stdlib.h" | |||
static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, | |||
float *, const int *, float *, const int *, int *); | |||
/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's cpbtrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html | |||
* */ | |||
void RELAPACK_cpbtrf( | |||
const char *uplo, const int *n, const int *kd, | |||
float *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*kd < 0) | |||
*info = -3; | |||
else if (*ldAb < *kd + 1) | |||
*info = -5; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CPBTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Constant | |||
const float ZERO[] = { 0., 0. }; | |||
// Allocate work space | |||
const int n1 = CREC_SPLIT(*n); | |||
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
float *Work = malloc(mWork * nWork * 2 * sizeof(float)); | |||
LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
// Recursive kernel | |||
RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
// Free work space | |||
free(Work); | |||
} | |||
/** cpbtrf's recursive compute kernel */ | |||
static void RELAPACK_cpbtrf_rec( | |||
const char *uplo, const int *n, const int *kd, | |||
float *Ab, const int *ldAb, | |||
float *Work, const int *ldWork, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { | |||
// Unblocked | |||
LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); | |||
// Splitting | |||
const int n1 = MIN(CREC_SPLIT(*n), *kd); | |||
const int n2 = *n - n1; | |||
// * * | |||
// * Ab_BR | |||
float *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + 2 * *ldA * n1; | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// recursion(A_TL) | |||
RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
// Banded splitting | |||
const int n21 = MIN(n2, *kd - n1); | |||
const int n22 = MIN(n2 - n21, *kd); | |||
// n1 n21 n22 | |||
// n1 * A_TRl A_TRr | |||
// n21 A_BLt A_BRtl A_BRtr | |||
// n22 A_BLb A_BRbl A_BRbr | |||
float *const A_TRl = A_TR; | |||
float *const A_TRr = A_TR + 2 * *ldA * n21; | |||
float *const A_BLt = A_BL; | |||
float *const A_BLb = A_BL + 2 * n21; | |||
float *const A_BRtl = A_BR; | |||
float *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
float *const A_BRbl = A_BR + 2 * n21; | |||
float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; | |||
if (*uplo == 'L') { | |||
// A_BLt = ABLt / A_TL' | |||
BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); | |||
// A_BRtl = A_BRtl - A_BLt * A_BLt' | |||
BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); | |||
// Work = A_BLb | |||
LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); | |||
// Work = Work / A_TL' | |||
BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); | |||
// A_BRbl = A_BRbl - Work * A_BLt' | |||
BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); | |||
// A_BRbr = A_BRbr - Work * Work' | |||
BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
// A_BLb = Work | |||
LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); | |||
} else { | |||
// A_TRl = A_TL' \ A_TRl | |||
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
// A_BRtl = A_BRtl - A_TRl' * A_TRl | |||
BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
// Work = A_TRr | |||
LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); | |||
// Work = A_TL' \ Work | |||
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); | |||
// A_BRtr = A_BRtr - A_TRl' * Work | |||
BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); | |||
// A_BRbr = A_BRbr - Work' * Work | |||
BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
// A_TRr = Work | |||
LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); | |||
} | |||
// recursion(A_BR) | |||
if (*kd > n1) | |||
RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info); | |||
else | |||
RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,92 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_cpotrf_rec(const char *, const int *, float *, | |||
const int *, int *); | |||
/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's cpotrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html | |||
* */ | |||
void RELAPACK_cpotrf( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CPOTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Recursive kernel | |||
RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
} | |||
/** cpotrf's recursive compute kernel */ | |||
static void RELAPACK_cpotrf_rec( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { | |||
// Unblocked | |||
LAPACK(cpotf2)(uplo, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
// Splitting | |||
const int n1 = CREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + 2 * *ldA * n1; | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// recursion(A_TL) | |||
RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL / A_TL' | |||
BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
// A_BR = A_BR - A_BL * A_BL' | |||
BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
} else { | |||
// A_TR = A_TL' \ A_TR | |||
BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
// A_BR = A_BR - A_TR' * A_TR | |||
BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,238 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, | |||
float *, const int *, int *, float *, const int *, int *); | |||
/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's csytrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html | |||
* */ | |||
void RELAPACK_csytrf( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
float *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CSYTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy arguments | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** csytrf's recursive compute kernel */ | |||
static void RELAPACK_csytrf_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterator | |||
int i; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = CREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
float *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
float *const A_BL_B = A + 2 * *n; | |||
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
float *const Work_BL = Work + 2 * n1; | |||
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
// last row of A_BL | |||
float *const A_BL_b = A_BL + 2 * n2_out; | |||
// last row of Work_BL | |||
float *const Work_BL_b = Work_BL + 2 * n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = CREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
float *const A_TL_T = A + 2 * *ldA * n_rest; | |||
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
float *const Work_L = Work; | |||
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,451 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static complex c_b1 = {1.f,0.f}; | |||
static int c__1 = 1; | |||
/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method. | |||
* | |||
* This routine is a minor modification of LAPACK's clasyf. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int * | |||
nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, | |||
int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
float r__1, r__2, r__3, r__4; | |||
complex q__1, q__2, q__3; | |||
/* Builtin functions */ | |||
double sqrt(double), r_imag(complex *); | |||
void c_div(complex *, complex *, complex *); | |||
/* Local variables */ | |||
static int j, k; | |||
static complex t, r1, d11, d21, d22; | |||
static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
static float alpha; | |||
extern /* Subroutine */ int cscal_(int *, complex *, complex *, | |||
int *); | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
, complex *, int *, complex *, int *, complex *, complex * | |||
, int *, ftnlen), ccopy_(int *, complex *, int *, | |||
complex *, int *), cswap_(int *, complex *, int *, | |||
complex *, int *); | |||
static int kstep; | |||
static float absakk; | |||
extern int icamax_(int *, complex *, int *); | |||
static float colmax, rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
} | |||
kstep = 1; | |||
i__1 = k + kw * w_dim1; | |||
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw * | |||
w_dim1]), dabs(r__2)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = imax + kw * w_dim1; | |||
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
+ kw * w_dim1]), dabs(r__2)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = k - imax; | |||
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
i__1 = k - imax; | |||
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
&c__1); | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
/* Computing MAX */ | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( | |||
r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( | |||
r__2)); | |||
rowmax = dmax(r__3,r__4); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else /* if(complicated condition) */ { | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha * | |||
rowmax) { | |||
kp = imax; | |||
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
i__1 = kk - 1 - kp; | |||
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
if (kp > 1) { | |||
i__1 = kp - 1; | |||
ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
+ 1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
+ 1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
r1.r = q__1.r, r1.i = q__1.i; | |||
i__1 = k - 1; | |||
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
if (k > 2) { | |||
i__1 = k - 1 + kw * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
c_div(&q__1, &w[k + kw * w_dim1], &d21); | |||
d11.r = q__1.r, d11.i = q__1.i; | |||
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
d22.r = q__1.r, d22.i = q__1.i; | |||
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
c_div(&q__1, &c_b1, &q__2); | |||
t.r = q__1.r, t.i = q__1.i; | |||
c_div(&q__1, &t, &d21); | |||
d21.r = q__1.r, d21.i = q__1.i; | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
i__2 = j + (k - 1) * a_dim1; | |||
i__3 = j + (kw - 1) * w_dim1; | |||
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + kw * w_dim1; | |||
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
.i; | |||
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
d21.r * q__2.i + d21.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + kw * w_dim1; | |||
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + (kw - 1) * w_dim1; | |||
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
.i; | |||
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
d21.r * q__2.i + d21.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L20: */ | |||
} | |||
} | |||
i__1 = k - 1 + (k - 1) * a_dim1; | |||
i__2 = k - 1 + (kw - 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1 + k * a_dim1; | |||
i__2 = k - 1 + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
++j; | |||
} | |||
++j; | |||
if (jp != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
} | |||
if (j < *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k | |||
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
kstep = 1; | |||
i__1 = k + k * w_dim1; | |||
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * | |||
w_dim1]), dabs(r__2)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = imax + k * w_dim1; | |||
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
+ k * w_dim1]), dabs(r__2)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
i__1 = imax - k; | |||
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = *n - imax + 1; | |||
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
1) * w_dim1], &c__1); | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], | |||
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * | |||
w_dim1], &c__1, (ftnlen)12); | |||
i__1 = imax - k; | |||
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
; | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
/* Computing MAX */ | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( | |||
r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( | |||
r__2)); | |||
rowmax = dmax(r__3,r__4); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else /* if(complicated condition) */ { | |||
i__1 = imax + (k + 1) * w_dim1; | |||
if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha * | |||
rowmax) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + | |||
k * w_dim1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
i__1 = kp - kk - 1; | |||
ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
1) * a_dim1], lda); | |||
if (kp < *n) { | |||
i__1 = *n - kp; | |||
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
+ kp * a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
} | |||
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
r1.r = q__1.r, r1.i = q__1.i; | |||
i__1 = *n - k; | |||
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
i__1 = k + 1 + k * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
d11.r = q__1.r, d11.i = q__1.i; | |||
c_div(&q__1, &w[k + k * w_dim1], &d21); | |||
d22.r = q__1.r, d22.i = q__1.i; | |||
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
c_div(&q__1, &c_b1, &q__2); | |||
t.r = q__1.r, t.i = q__1.i; | |||
c_div(&q__1, &t, &d21); | |||
d21.r = q__1.r, d21.i = q__1.i; | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + k * w_dim1; | |||
q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + (k + 1) * w_dim1; | |||
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
.i; | |||
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
d21.r * q__2.i + d21.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
i__2 = j + (k + 1) * a_dim1; | |||
i__3 = j + (k + 1) * w_dim1; | |||
q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + k * w_dim1; | |||
q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
.i; | |||
q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
d21.r * q__2.i + d21.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L80: */ | |||
} | |||
} | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + k * a_dim1; | |||
i__2 = k + 1 + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + (k + 1) * a_dim1; | |||
i__2 = k + 1 + (k + 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
--j; | |||
} | |||
--j; | |||
if (jp != jj && j >= 1) { | |||
cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j > 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,236 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, | |||
float *, const int *, int *, float *, const int *, int *); | |||
/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's csytrf_rook. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html | |||
* */ | |||
void RELAPACK_csytrf_rook( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
float *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CSYTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy argument | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** csytrf_rook's recursive compute kernel */ | |||
static void RELAPACK_csytrf_rook_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = CREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
float *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
float *const A_BL_B = A + 2 * *n; | |||
float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
float *const Work_BL = Work + 2 * n1; | |||
float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
// last row of A_BL | |||
float *const A_BL_b = A_BL + 2 * n2_out; | |||
// last row of Work_BL | |||
float *const Work_BL_b = Work_BL + 2 * n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = CREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
float *const A_TL_T = A + 2 * *ldA * n_rest; | |||
float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
float *const Work_L = Work; | |||
float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,565 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static complex c_b1 = {1.f,0.f}; | |||
static int c__1 = 1; | |||
/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method. | |||
* | |||
* This routine is a minor modification of LAPACK's clasyf_rook. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n, | |||
int *nb, int *kb, complex *a, int *lda, int *ipiv, | |||
complex *w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
float r__1, r__2; | |||
complex q__1, q__2, q__3, q__4; | |||
/* Builtin functions */ | |||
double sqrt(double), r_imag(complex *); | |||
void c_div(complex *, complex *, complex *); | |||
/* Local variables */ | |||
static int j, k, p; | |||
static complex t, r1, d11, d12, d21, d22; | |||
static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
static logical done; | |||
static int imax, jmax; | |||
static float alpha; | |||
extern /* Subroutine */ int cscal_(int *, complex *, complex *, | |||
int *); | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
, complex *, int *, complex *, int *, complex *, complex * | |||
, int *, ftnlen); | |||
static float sfmin; | |||
extern /* Subroutine */ int ccopy_(int *, complex *, int *, | |||
complex *, int *); | |||
static int itemp; | |||
extern /* Subroutine */ int cswap_(int *, complex *, int *, | |||
complex *, int *); | |||
static int kstep; | |||
static float stemp, absakk; | |||
extern int icamax_(int *, complex *, int *); | |||
extern double slamch_(char *, ftnlen); | |||
static float colmax, rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
sfmin = slamch_("S", (ftnlen)1); | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
kstep = 1; | |||
p = k; | |||
ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
} | |||
i__1 = k + kw * w_dim1; | |||
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw * | |||
w_dim1]), dabs(r__2)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = imax + kw * w_dim1; | |||
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
+ kw * w_dim1]), dabs(r__2)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L12: | |||
ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = k - imax; | |||
ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
if (imax != k) { | |||
i__1 = k - imax; | |||
jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
w_dim1], &c__1); | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
} else { | |||
rowmax = 0.f; | |||
} | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
i__1 = itemp + (kw - 1) * w_dim1; | |||
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); | |||
if (stemp > rowmax) { | |||
rowmax = stemp; | |||
jmax = itemp; | |||
} | |||
} | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha * | |||
rowmax)) { | |||
kp = imax; | |||
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} | |||
if (! done) { | |||
goto L12; | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kstep == 2 && p != k) { | |||
i__1 = k - p; | |||
ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
a_dim1], lda); | |||
ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
c__1); | |||
i__1 = *n - k + 1; | |||
cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
lda); | |||
i__1 = *n - kk + 1; | |||
cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
ldw); | |||
} | |||
if (kp != kk) { | |||
i__1 = kp + k * a_dim1; | |||
i__2 = kk + k * a_dim1; | |||
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
i__1 = k - 1 - kp; | |||
ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
c__1); | |||
i__1 = *n - kk + 1; | |||
cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
lda); | |||
i__1 = *n - kk + 1; | |||
cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
if (k > 1) { | |||
i__1 = k + k * a_dim1; | |||
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k | |||
+ k * a_dim1]), dabs(r__2)) >= sfmin) { | |||
c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
r1.r = q__1.r, r1.i = q__1.i; | |||
i__1 = k - 1; | |||
cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else /* if(complicated condition) */ { | |||
i__1 = k + k * a_dim1; | |||
if (a[i__1].r != 0.f || a[i__1].i != 0.f) { | |||
i__1 = k - 1; | |||
for (ii = 1; ii <= i__1; ++ii) { | |||
i__2 = ii + k * a_dim1; | |||
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * | |||
a_dim1]); | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L14: */ | |||
} | |||
} | |||
} | |||
} | |||
} else { | |||
if (k > 2) { | |||
i__1 = k - 1 + kw * w_dim1; | |||
d12.r = w[i__1].r, d12.i = w[i__1].i; | |||
c_div(&q__1, &w[k + kw * w_dim1], &d12); | |||
d11.r = q__1.r, d11.i = q__1.i; | |||
c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); | |||
d22.r = q__1.r, d22.i = q__1.i; | |||
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
c_div(&q__1, &c_b1, &q__2); | |||
t.r = q__1.r, t.i = q__1.i; | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
i__2 = j + (k - 1) * a_dim1; | |||
i__3 = j + (kw - 1) * w_dim1; | |||
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + kw * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
c_div(&q__2, &q__3, &d12); | |||
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
q__2.i + t.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + kw * w_dim1; | |||
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + (kw - 1) * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
c_div(&q__2, &q__3, &d12); | |||
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
q__2.i + t.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L20: */ | |||
} | |||
} | |||
i__1 = k - 1 + (k - 1) * a_dim1; | |||
i__2 = k - 1 + (kw - 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1 + k * a_dim1; | |||
i__2 = k - 1 + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
++j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
++j; | |||
if (jp2 != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
jj = j - 1; | |||
if (jp1 != jj && kstep == 2) { | |||
i__1 = *n - j + 1; | |||
cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
if (j <= *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
kstep = 1; | |||
p = k; | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & | |||
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
i__1 = k + k * w_dim1; | |||
absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * | |||
w_dim1]), dabs(r__2)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = imax + k * w_dim1; | |||
colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
+ k * w_dim1]), dabs(r__2)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L72: | |||
i__1 = imax - k; | |||
ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = *n - imax + 1; | |||
ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
1) * w_dim1], &c__1); | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
q__1.r = -1.f, q__1.i = -0.f; | |||
cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] | |||
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
1) * w_dim1], &c__1, (ftnlen)12); | |||
} | |||
if (imax != k) { | |||
i__1 = imax - k; | |||
jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
c__1); | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
w[jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
} else { | |||
rowmax = 0.f; | |||
} | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = itemp + (k + 1) * w_dim1; | |||
stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
w[itemp + (k + 1) * w_dim1]), dabs(r__2)); | |||
if (stemp > rowmax) { | |||
rowmax = stemp; | |||
jmax = itemp; | |||
} | |||
} | |||
i__1 = imax + (k + 1) * w_dim1; | |||
if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha * | |||
rowmax)) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
} | |||
if (! done) { | |||
goto L72; | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kstep == 2 && p != k) { | |||
i__1 = p - k; | |||
ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
lda); | |||
i__1 = *n - p + 1; | |||
ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
c__1); | |||
cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
} | |||
if (kp != kk) { | |||
i__1 = kp + k * a_dim1; | |||
i__2 = kk + k * a_dim1; | |||
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
i__1 = kp - k - 1; | |||
ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
* a_dim1], lda); | |||
i__1 = *n - kp + 1; | |||
ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
a_dim1], &c__1); | |||
cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
i__1 = k + k * a_dim1; | |||
if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k | |||
+ k * a_dim1]), dabs(r__2)) >= sfmin) { | |||
c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
r1.r = q__1.r, r1.i = q__1.i; | |||
i__1 = *n - k; | |||
cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} else /* if(complicated condition) */ { | |||
i__1 = k + k * a_dim1; | |||
if (a[i__1].r != 0.f || a[i__1].i != 0.f) { | |||
i__1 = *n; | |||
for (ii = k + 1; ii <= i__1; ++ii) { | |||
i__2 = ii + k * a_dim1; | |||
c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * | |||
a_dim1]); | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L74: */ | |||
} | |||
} | |||
} | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
i__1 = k + 1 + k * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
d11.r = q__1.r, d11.i = q__1.i; | |||
c_div(&q__1, &w[k + k * w_dim1], &d21); | |||
d22.r = q__1.r, d22.i = q__1.i; | |||
q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
c_div(&q__1, &c_b1, &q__2); | |||
t.r = q__1.r, t.i = q__1.i; | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + k * w_dim1; | |||
q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + (k + 1) * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
c_div(&q__2, &q__3, &d21); | |||
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
q__2.i + t.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
i__2 = j + (k + 1) * a_dim1; | |||
i__3 = j + (k + 1) * w_dim1; | |||
q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + k * w_dim1; | |||
q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
.i; | |||
c_div(&q__2, &q__3, &d21); | |||
q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
q__2.i + t.i * q__2.r; | |||
a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
/* L80: */ | |||
} | |||
} | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + k * a_dim1; | |||
i__2 = k + 1 + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + (k + 1) * a_dim1; | |||
i__2 = k + 1 + (k + 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
--j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
--j; | |||
if (jp2 != jj && j >= 1) { | |||
cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
jj = j + 1; | |||
if (jp1 != jj && kstep == 2) { | |||
cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j >= 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,268 @@ | |||
#include "relapack.h" | |||
#include <math.h> | |||
static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, | |||
const int *, const float *, const int *, const float *, const int *, | |||
float *, const int *, const float *, const int *, const float *, | |||
const int *, float *, const int *, float *, float *, float *, int *); | |||
/** CTGSYL solves the generalized Sylvester equation. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ctgsyl. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html | |||
* */ | |||
void RELAPACK_ctgsyl( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, | |||
const float *D, const int *ldD, const float *E, const int *ldE, | |||
float *F, const int *ldF, | |||
float *scale, float *dif, | |||
float *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
// Parse arguments | |||
const int notran = LAPACK(lsame)(trans, "N"); | |||
const int tran = LAPACK(lsame)(trans, "C"); | |||
// Compute work buffer size | |||
int lwmin = 1; | |||
if (notran && (*ijob == 1 || *ijob == 2)) | |||
lwmin = MAX(1, 2 * *m * *n); | |||
*info = 0; | |||
// Check arguments | |||
if (!tran && !notran) | |||
*info = -1; | |||
else if (notran && (*ijob < 0 || *ijob > 4)) | |||
*info = -2; | |||
else if (*m <= 0) | |||
*info = -3; | |||
else if (*n <= 0) | |||
*info = -4; | |||
else if (*ldA < MAX(1, *m)) | |||
*info = -6; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -8; | |||
else if (*ldC < MAX(1, *m)) | |||
*info = -10; | |||
else if (*ldD < MAX(1, *m)) | |||
*info = -12; | |||
else if (*ldE < MAX(1, *n)) | |||
*info = -14; | |||
else if (*ldF < MAX(1, *m)) | |||
*info = -16; | |||
else if (*lWork < lwmin && *lWork != -1) | |||
*info = -20; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CTGSYL", &minfo); | |||
return; | |||
} | |||
if (*lWork == -1) { | |||
// Work size query | |||
*Work = lwmin; | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleantrans = notran ? 'N' : 'C'; | |||
// Constant | |||
const float ZERO[] = { 0., 0. }; | |||
int isolve = 1; | |||
int ifunc = 0; | |||
if (notran) { | |||
if (*ijob >= 3) { | |||
ifunc = *ijob - 2; | |||
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC); | |||
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF); | |||
} else if (*ijob >= 1) | |||
isolve = 2; | |||
} | |||
float scale2; | |||
int iround; | |||
for (iround = 1; iround <= isolve; iround++) { | |||
*scale = 1; | |||
float dscale = 0; | |||
float dsum = 1; | |||
RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info); | |||
if (dscale != 0) { | |||
if (*ijob == 1 || *ijob == 3) | |||
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); | |||
else | |||
*dif = sqrt(*m * *n) / (dscale * sqrt(dsum)); | |||
} | |||
if (isolve == 2) { | |||
if (iround == 1) { | |||
if (notran) | |||
ifunc = *ijob; | |||
scale2 = *scale; | |||
LAPACK(clacpy)("F", m, n, C, ldC, Work, m); | |||
LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m); | |||
LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC); | |||
LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF); | |||
} else { | |||
LAPACK(clacpy)("F", m, n, Work, m, C, ldC); | |||
LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF); | |||
*scale = scale2; | |||
} | |||
} | |||
} | |||
} | |||
/** ctgsyl's recursive vompute kernel */ | |||
static void RELAPACK_ctgsyl_rec( | |||
const char *trans, const int *ifunc, const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, | |||
const float *D, const int *ldD, const float *E, const int *ldE, | |||
float *F, const int *ldF, | |||
float *scale, float *dsum, float *dscale, | |||
int *info | |||
) { | |||
if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) { | |||
// Unblocked | |||
LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Outputs | |||
float scale1[] = { 1., 0. }; | |||
float scale2[] = { 1., 0. }; | |||
int info1[] = { 0 }; | |||
int info2[] = { 0 }; | |||
if (*m > *n) { | |||
// Splitting | |||
const int m1 = CREC_SPLIT(*m); | |||
const int m2 = *m - m1; | |||
// A_TL A_TR | |||
// 0 A_BR | |||
const float *const A_TL = A; | |||
const float *const A_TR = A + 2 * *ldA * m1; | |||
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
// C_T | |||
// C_B | |||
float *const C_T = C; | |||
float *const C_B = C + 2 * m1; | |||
// D_TL D_TR | |||
// 0 D_BR | |||
const float *const D_TL = D; | |||
const float *const D_TR = D + 2 * *ldD * m1; | |||
const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1; | |||
// F_T | |||
// F_B | |||
float *const F_T = F; | |||
float *const F_B = F + 2 * m1; | |||
if (*trans == 'N') { | |||
// recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); | |||
// C_T = C_T - A_TR * C_B | |||
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
// F_T = F_T - D_TR * C_B | |||
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
// recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
} | |||
} else { | |||
// recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); | |||
// apply scale | |||
if (scale1[0] != 1) | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
// C_B = C_B - A_TR^H * C_T | |||
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
// C_B = C_B - D_TR^H * F_T | |||
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
// recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
} | |||
} | |||
} else { | |||
// Splitting | |||
const int n1 = CREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// B_TL B_TR | |||
// 0 B_BR | |||
const float *const B_TL = B; | |||
const float *const B_TR = B + 2 * *ldB * n1; | |||
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
// C_L C_R | |||
float *const C_L = C; | |||
float *const C_R = C + 2 * *ldC * n1; | |||
// E_TL E_TR | |||
// 0 E_BR | |||
const float *const E_TL = E; | |||
const float *const E_TR = E + 2 * *ldE * n1; | |||
const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1; | |||
// F_L F_R | |||
float *const F_L = F; | |||
float *const F_R = F + 2 * *ldF * n1; | |||
if (*trans == 'N') { | |||
// recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); | |||
// C_R = C_R + F_L * B_TR | |||
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
// F_R = F_R + F_L * E_TR | |||
BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
// recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
} | |||
} else { | |||
// recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); | |||
// apply scale | |||
if (scale1[0] != 1) | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
// F_L = F_L + C_R * B_TR | |||
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
// F_L = F_L + F_R * E_TR | |||
BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
// recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
} | |||
} | |||
} | |||
*scale = scale1[0] * scale2[0]; | |||
*info = info1[0] || info2[0]; | |||
} |
@@ -0,0 +1,163 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, | |||
const int *, const int *, const float *, const int *, const float *, | |||
const int *, float *, const int *, float *, int *); | |||
/** CTRSYL solves the complex Sylvester matrix equation. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ctrsyl. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html | |||
* */ | |||
void RELAPACK_ctrsyl( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, float *scale, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int notransA = LAPACK(lsame)(tranA, "N"); | |||
const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
const int notransB = LAPACK(lsame)(tranB, "N"); | |||
const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
*info = 0; | |||
if (!ctransA && !notransA) | |||
*info = -1; | |||
else if (!ctransB && !notransB) | |||
*info = -2; | |||
else if (*isgn != 1 && *isgn != -1) | |||
*info = -3; | |||
else if (*m < 0) | |||
*info = -4; | |||
else if (*n < 0) | |||
*info = -5; | |||
else if (*ldA < MAX(1, *m)) | |||
*info = -7; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -9; | |||
else if (*ldC < MAX(1, *m)) | |||
*info = -11; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CTRSYL", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleantranA = notransA ? 'N' : 'C'; | |||
const char cleantranB = notransB ? 'N' : 'C'; | |||
// Recursive kernel | |||
RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
/** ctrsyl's recursive compute kernel */ | |||
static void RELAPACK_ctrsyl_rec( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, float *scale, | |||
int *info | |||
) { | |||
if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { | |||
// Unblocked | |||
RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
const float MSGN[] = { -*isgn, 0. }; | |||
const int iONE[] = { 1 }; | |||
// Outputs | |||
float scale1[] = { 1., 0. }; | |||
float scale2[] = { 1., 0. }; | |||
int info1[] = { 0 }; | |||
int info2[] = { 0 }; | |||
if (*m > *n) { | |||
// Splitting | |||
const int m1 = CREC_SPLIT(*m); | |||
const int m2 = *m - m1; | |||
// A_TL A_TR | |||
// 0 A_BR | |||
const float *const A_TL = A; | |||
const float *const A_TR = A + 2 * *ldA * m1; | |||
const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
// C_T | |||
// C_B | |||
float *const C_T = C; | |||
float *const C_B = C + 2 * m1; | |||
if (*tranA == 'N') { | |||
// recusion(A_BR, B, C_B) | |||
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
// C_T = C_T - A_TR * C_B | |||
BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
// recusion(A_TL, B, C_T) | |||
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
} else { | |||
// recusion(A_TL, B, C_T) | |||
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
// C_B = C_B - A_TR' * C_T | |||
BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
// recusion(A_BR, B, C_B) | |||
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
} | |||
} else { | |||
// Splitting | |||
const int n1 = CREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// B_TL B_TR | |||
// 0 B_BR | |||
const float *const B_TL = B; | |||
const float *const B_TR = B + 2 * *ldB * n1; | |||
const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
// C_L C_R | |||
float *const C_L = C; | |||
float *const C_R = C + 2 * *ldC * n1; | |||
if (*tranB == 'N') { | |||
// recusion(A, B_TL, C_L) | |||
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
// C_R = C_R -/+ C_L * B_TR | |||
BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
// recusion(A, B_BR, C_R) | |||
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
} else { | |||
// recusion(A, B_BR, C_R) | |||
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
// C_L = C_L -/+ C_R * B_TR' | |||
BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
// recusion(A, B_TL, C_L) | |||
RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
} | |||
} | |||
*scale = scale1[0] * scale2[0]; | |||
*info = info1[0] || info2[0]; | |||
} |
@@ -0,0 +1,392 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "../config.h" | |||
#include "f2c.h" | |||
#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { | |||
extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); | |||
complex result; | |||
cdotu_(&result, n, x, incx, y, incy); | |||
return result; | |||
} | |||
#define cdotu_ cdotu_fun | |||
complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { | |||
extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); | |||
complex result; | |||
cdotc_(&result, n, x, incx, y, incy); | |||
return result; | |||
} | |||
#define cdotc_ cdotc_fun | |||
#endif | |||
#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
complex cladiv_fun(complex *a, complex *b) { | |||
extern void cladiv_(complex *, complex *, complex *); | |||
complex result; | |||
cladiv_(&result, a, b); | |||
return result; | |||
} | |||
#define cladiv_ cladiv_fun | |||
#endif | |||
/* Table of constant values */ | |||
static int c__1 = 1; | |||
/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) | |||
* | |||
* This routine is an exact copy of LAPACK's ctrsyl. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* */ | |||
/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int | |||
*isgn, int *m, int *n, complex *a, int *lda, complex *b, | |||
int *ldb, complex *c__, int *ldc, float *scale, int *info, | |||
ftnlen trana_len, ftnlen tranb_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, | |||
i__3, i__4; | |||
float r__1, r__2; | |||
complex q__1, q__2, q__3, q__4; | |||
/* Builtin functions */ | |||
float r_imag(complex *); | |||
void r_cnjg(complex *, complex *); | |||
/* Local variables */ | |||
static int j, k, l; | |||
static complex a11; | |||
static float db; | |||
static complex x11; | |||
static float da11; | |||
static complex vec; | |||
static float dum[1], eps, sgn, smin; | |||
static complex suml, sumr; | |||
/* Complex */ complex cdotc_(int *, complex *, int | |||
*, complex *, int *); | |||
extern int lsame_(char *, char *, ftnlen, ftnlen); | |||
/* Complex */ complex cdotu_(int *, complex *, int | |||
*, complex *, int *); | |||
extern /* Subroutine */ int slabad_(float *, float *); | |||
extern float clange_(char *, int *, int *, complex *, | |||
int *, float *, ftnlen); | |||
/* Complex */ complex cladiv_(complex *, complex *); | |||
static float scaloc; | |||
extern float slamch_(char *, ftnlen); | |||
extern /* Subroutine */ int csscal_(int *, float *, complex *, int | |||
*), xerbla_(char *, int *, ftnlen); | |||
static float bignum; | |||
static int notrna, notrnb; | |||
static float smlnum; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
b_dim1 = *ldb; | |||
b_offset = 1 + b_dim1; | |||
b -= b_offset; | |||
c_dim1 = *ldc; | |||
c_offset = 1 + c_dim1; | |||
c__ -= c_offset; | |||
/* Function Body */ | |||
notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); | |||
notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); | |||
*info = 0; | |||
if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { | |||
*info = -1; | |||
} else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { | |||
*info = -2; | |||
} else if (*isgn != 1 && *isgn != -1) { | |||
*info = -3; | |||
} else if (*m < 0) { | |||
*info = -4; | |||
} else if (*n < 0) { | |||
*info = -5; | |||
} else if (*lda < max(1,*m)) { | |||
*info = -7; | |||
} else if (*ldb < max(1,*n)) { | |||
*info = -9; | |||
} else if (*ldc < max(1,*m)) { | |||
*info = -11; | |||
} | |||
if (*info != 0) { | |||
i__1 = -(*info); | |||
xerbla_("CTRSY2", &i__1, (ftnlen)6); | |||
return; | |||
} | |||
*scale = 1.f; | |||
if (*m == 0 || *n == 0) { | |||
return; | |||
} | |||
eps = slamch_("P", (ftnlen)1); | |||
smlnum = slamch_("S", (ftnlen)1); | |||
bignum = 1.f / smlnum; | |||
slabad_(&smlnum, &bignum); | |||
smlnum = smlnum * (float) (*m * *n) / eps; | |||
bignum = 1.f / smlnum; | |||
/* Computing MAX */ | |||
r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, ( | |||
ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n, | |||
&b[b_offset], ldb, dum, (ftnlen)1); | |||
smin = dmax(r__1,r__2); | |||
sgn = (float) (*isgn); | |||
if (notrna && notrnb) { | |||
i__1 = *n; | |||
for (l = 1; l <= i__1; ++l) { | |||
for (k = *m; k >= 1; --k) { | |||
i__2 = *m - k; | |||
/* Computing MIN */ | |||
i__3 = k + 1; | |||
/* Computing MIN */ | |||
i__4 = k + 1; | |||
q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ | |||
min(i__4,*m) + l * c_dim1], &c__1); | |||
suml.r = q__1.r, suml.i = q__1.i; | |||
i__2 = l - 1; | |||
q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] | |||
, &c__1); | |||
sumr.r = q__1.r, sumr.i = q__1.i; | |||
i__2 = k + l * c_dim1; | |||
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; | |||
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; | |||
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; | |||
vec.r = q__1.r, vec.i = q__1.i; | |||
scaloc = 1.f; | |||
i__2 = k + k * a_dim1; | |||
i__3 = l + l * b_dim1; | |||
q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i; | |||
q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i; | |||
a11.r = q__1.r, a11.i = q__1.i; | |||
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), | |||
dabs(r__2)); | |||
if (da11 <= smin) { | |||
a11.r = smin, a11.i = 0.f; | |||
da11 = smin; | |||
*info = 1; | |||
} | |||
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( | |||
r__2)); | |||
if (da11 < 1.f && db > 1.f) { | |||
if (db > bignum * da11) { | |||
scaloc = 1.f / db; | |||
} | |||
} | |||
q__3.r = scaloc, q__3.i = 0.f; | |||
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * | |||
q__3.i + vec.i * q__3.r; | |||
q__1 = cladiv_(&q__2, &a11); | |||
x11.r = q__1.r, x11.i = q__1.i; | |||
if (scaloc != 1.f) { | |||
i__2 = *n; | |||
for (j = 1; j <= i__2; ++j) { | |||
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
/* L10: */ | |||
} | |||
*scale *= scaloc; | |||
} | |||
i__2 = k + l * c_dim1; | |||
c__[i__2].r = x11.r, c__[i__2].i = x11.i; | |||
/* L20: */ | |||
} | |||
/* L30: */ | |||
} | |||
} else if (! notrna && notrnb) { | |||
i__1 = *n; | |||
for (l = 1; l <= i__1; ++l) { | |||
i__2 = *m; | |||
for (k = 1; k <= i__2; ++k) { | |||
i__3 = k - 1; | |||
q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * | |||
c_dim1 + 1], &c__1); | |||
suml.r = q__1.r, suml.i = q__1.i; | |||
i__3 = l - 1; | |||
q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] | |||
, &c__1); | |||
sumr.r = q__1.r, sumr.i = q__1.i; | |||
i__3 = k + l * c_dim1; | |||
q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; | |||
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; | |||
q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i; | |||
vec.r = q__1.r, vec.i = q__1.i; | |||
scaloc = 1.f; | |||
r_cnjg(&q__2, &a[k + k * a_dim1]); | |||
i__3 = l + l * b_dim1; | |||
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; | |||
q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; | |||
a11.r = q__1.r, a11.i = q__1.i; | |||
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), | |||
dabs(r__2)); | |||
if (da11 <= smin) { | |||
a11.r = smin, a11.i = 0.f; | |||
da11 = smin; | |||
*info = 1; | |||
} | |||
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( | |||
r__2)); | |||
if (da11 < 1.f && db > 1.f) { | |||
if (db > bignum * da11) { | |||
scaloc = 1.f / db; | |||
} | |||
} | |||
q__3.r = scaloc, q__3.i = 0.f; | |||
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * | |||
q__3.i + vec.i * q__3.r; | |||
q__1 = cladiv_(&q__2, &a11); | |||
x11.r = q__1.r, x11.i = q__1.i; | |||
if (scaloc != 1.f) { | |||
i__3 = *n; | |||
for (j = 1; j <= i__3; ++j) { | |||
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
/* L40: */ | |||
} | |||
*scale *= scaloc; | |||
} | |||
i__3 = k + l * c_dim1; | |||
c__[i__3].r = x11.r, c__[i__3].i = x11.i; | |||
/* L50: */ | |||
} | |||
/* L60: */ | |||
} | |||
} else if (! notrna && ! notrnb) { | |||
for (l = *n; l >= 1; --l) { | |||
i__1 = *m; | |||
for (k = 1; k <= i__1; ++k) { | |||
i__2 = k - 1; | |||
q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * | |||
c_dim1 + 1], &c__1); | |||
suml.r = q__1.r, suml.i = q__1.i; | |||
i__2 = *n - l; | |||
/* Computing MIN */ | |||
i__3 = l + 1; | |||
/* Computing MIN */ | |||
i__4 = l + 1; | |||
q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ | |||
l + min(i__4,*n) * b_dim1], ldb); | |||
sumr.r = q__1.r, sumr.i = q__1.i; | |||
i__2 = k + l * c_dim1; | |||
r_cnjg(&q__4, &sumr); | |||
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; | |||
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; | |||
q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; | |||
vec.r = q__1.r, vec.i = q__1.i; | |||
scaloc = 1.f; | |||
i__2 = k + k * a_dim1; | |||
i__3 = l + l * b_dim1; | |||
q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; | |||
q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i; | |||
r_cnjg(&q__1, &q__2); | |||
a11.r = q__1.r, a11.i = q__1.i; | |||
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), | |||
dabs(r__2)); | |||
if (da11 <= smin) { | |||
a11.r = smin, a11.i = 0.f; | |||
da11 = smin; | |||
*info = 1; | |||
} | |||
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( | |||
r__2)); | |||
if (da11 < 1.f && db > 1.f) { | |||
if (db > bignum * da11) { | |||
scaloc = 1.f / db; | |||
} | |||
} | |||
q__3.r = scaloc, q__3.i = 0.f; | |||
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * | |||
q__3.i + vec.i * q__3.r; | |||
q__1 = cladiv_(&q__2, &a11); | |||
x11.r = q__1.r, x11.i = q__1.i; | |||
if (scaloc != 1.f) { | |||
i__2 = *n; | |||
for (j = 1; j <= i__2; ++j) { | |||
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
/* L70: */ | |||
} | |||
*scale *= scaloc; | |||
} | |||
i__2 = k + l * c_dim1; | |||
c__[i__2].r = x11.r, c__[i__2].i = x11.i; | |||
/* L80: */ | |||
} | |||
/* L90: */ | |||
} | |||
} else if (notrna && ! notrnb) { | |||
for (l = *n; l >= 1; --l) { | |||
for (k = *m; k >= 1; --k) { | |||
i__1 = *m - k; | |||
/* Computing MIN */ | |||
i__2 = k + 1; | |||
/* Computing MIN */ | |||
i__3 = k + 1; | |||
q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[ | |||
min(i__3,*m) + l * c_dim1], &c__1); | |||
suml.r = q__1.r, suml.i = q__1.i; | |||
i__1 = *n - l; | |||
/* Computing MIN */ | |||
i__2 = l + 1; | |||
/* Computing MIN */ | |||
i__3 = l + 1; | |||
q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[ | |||
l + min(i__3,*n) * b_dim1], ldb); | |||
sumr.r = q__1.r, sumr.i = q__1.i; | |||
i__1 = k + l * c_dim1; | |||
r_cnjg(&q__4, &sumr); | |||
q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; | |||
q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; | |||
q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i; | |||
vec.r = q__1.r, vec.i = q__1.i; | |||
scaloc = 1.f; | |||
i__1 = k + k * a_dim1; | |||
r_cnjg(&q__3, &b[l + l * b_dim1]); | |||
q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i; | |||
q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i; | |||
a11.r = q__1.r, a11.i = q__1.i; | |||
da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), | |||
dabs(r__2)); | |||
if (da11 <= smin) { | |||
a11.r = smin, a11.i = 0.f; | |||
da11 = smin; | |||
*info = 1; | |||
} | |||
db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( | |||
r__2)); | |||
if (da11 < 1.f && db > 1.f) { | |||
if (db > bignum * da11) { | |||
scaloc = 1.f / db; | |||
} | |||
} | |||
q__3.r = scaloc, q__3.i = 0.f; | |||
q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * | |||
q__3.i + vec.i * q__3.r; | |||
q__1 = cladiv_(&q__2, &a11); | |||
x11.r = q__1.r, x11.i = q__1.i; | |||
if (scaloc != 1.f) { | |||
i__1 = *n; | |||
for (j = 1; j <= i__1; ++j) { | |||
csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
/* L100: */ | |||
} | |||
*scale *= scaloc; | |||
} | |||
i__1 = k + l * c_dim1; | |||
c__[i__1].r = x11.r, c__[i__1].i = x11.i; | |||
/* L110: */ | |||
} | |||
/* L120: */ | |||
} | |||
} | |||
return; | |||
} |
@@ -0,0 +1,107 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, | |||
float *, const int *, int *); | |||
/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ctrtri. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html | |||
* */ | |||
void RELAPACK_ctrtri( | |||
const char *uplo, const char *diag, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
const int nounit = LAPACK(lsame)(diag, "N"); | |||
const int unit = LAPACK(lsame)(diag, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (!nounit && !unit) | |||
*info = -2; | |||
else if (*n < 0) | |||
*info = -3; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -5; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("CTRTRI", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
const char cleandiag = nounit ? 'N' : 'U'; | |||
// check for singularity | |||
if (nounit) { | |||
int i; | |||
for (i = 0; i < *n; i++) | |||
if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { | |||
*info = i; | |||
return; | |||
} | |||
} | |||
// Recursive kernel | |||
RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
} | |||
/** ctrtri's recursive compute kernel */ | |||
static void RELAPACK_ctrtri_rec( | |||
const char *uplo, const char *diag, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { | |||
// Unblocked | |||
LAPACK(ctrti2)(uplo, diag, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1., 0. }; | |||
const float MONE[] = { -1., 0. }; | |||
// Splitting | |||
const int n1 = CREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + 2 * *ldA * n1; | |||
float *const A_BL = A + 2 * n1; | |||
float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// recursion(A_TL) | |||
RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
if (*uplo == 'L') { | |||
// A_BL = - A_BL * A_TL | |||
BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
// A_BL = A_BR \ A_BL | |||
BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
} else { | |||
// A_TR = - A_TL * A_TR | |||
BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
// A_TR = A_TR / A_BR | |||
BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,227 @@ | |||
#include "relapack.h" | |||
#include "stdlib.h" | |||
static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, | |||
const int *, double *, const int *, int *, double *, const int *, double *, | |||
const int *, int *); | |||
/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dgbtrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html | |||
* */ | |||
void RELAPACK_dgbtrf( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
double *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
// Check arguments | |||
*info = 0; | |||
if (*m < 0) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*kl < 0) | |||
*info = -3; | |||
else if (*ku < 0) | |||
*info = -4; | |||
else if (*ldAb < 2 * *kl + *ku + 1) | |||
*info = -6; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DGBTRF", &minfo); | |||
return; | |||
} | |||
// Constant | |||
const double ZERO[] = { 0. }; | |||
// Result upper band width | |||
const int kv = *ku + *kl; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
double *const A = Ab + kv; | |||
// Zero upper diagonal fill-in elements | |||
int i, j; | |||
for (j = 0; j < *n; j++) { | |||
double *const A_j = A + *ldA * j; | |||
for (i = MAX(0, j - kv); i < j - *ku; i++) | |||
A_j[i] = 0.; | |||
} | |||
// Allocate work space | |||
const int n1 = DREC_SPLIT(*n); | |||
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
const int nWorkl = (kv > n1) ? n1 : kv; | |||
const int mWorku = (*kl > n1) ? n1 : *kl; | |||
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); | |||
double *Worku = malloc(mWorku * nWorku * sizeof(double)); | |||
LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
// Recursive kernel | |||
RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
// Free work space | |||
free(Workl); | |||
free(Worku); | |||
} | |||
/** dgbtrf's recursive compute kernel */ | |||
static void RELAPACK_dgbtrf_rec( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
double *Ab, const int *ldAb, int *ipiv, | |||
double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { | |||
// Unblocked | |||
LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterators | |||
int i, j; | |||
// Output upper band width | |||
const int kv = *ku + *kl; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
double *const A = Ab + kv; | |||
// Splitting | |||
const int n1 = MIN(DREC_SPLIT(*n), *kl); | |||
const int n2 = *n - n1; | |||
const int m1 = MIN(n1, *m); | |||
const int m2 = *m - m1; | |||
const int mn1 = MIN(m1, n1); | |||
const int mn2 = MIN(m2, n2); | |||
// Ab_L * | |||
// Ab_BR | |||
double *const Ab_L = Ab; | |||
double *const Ab_BR = Ab + *ldAb * n1; | |||
// A_L A_R | |||
double *const A_L = A; | |||
double *const A_R = A + *ldA * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + *ldA * n1; | |||
double *const A_BL = A + m1; | |||
double *const A_BR = A + *ldA * n1 + m1; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_T = ipiv; | |||
int *const ipiv_B = ipiv + n1; | |||
// Banded splitting | |||
const int n21 = MIN(n2, kv - n1); | |||
const int n22 = MIN(n2 - n21, n1); | |||
const int m21 = MIN(m2, *kl - m1); | |||
const int m22 = MIN(m2 - m21, m1); | |||
// n1 n21 n22 | |||
// m * A_Rl ARr | |||
double *const A_Rl = A_R; | |||
double *const A_Rr = A_R + *ldA * n21; | |||
// n1 n21 n22 | |||
// m1 * A_TRl A_TRr | |||
// m21 A_BLt A_BRtl A_BRtr | |||
// m22 A_BLb A_BRbl A_BRbr | |||
double *const A_TRl = A_TR; | |||
double *const A_TRr = A_TR + *ldA * n21; | |||
double *const A_BLt = A_BL; | |||
double *const A_BLb = A_BL + m21; | |||
double *const A_BRtl = A_BR; | |||
double *const A_BRtr = A_BR + *ldA * n21; | |||
double *const A_BRbl = A_BR + m21; | |||
double *const A_BRbr = A_BR + *ldA * n21 + m21; | |||
// recursion(Ab_L, ipiv_T) | |||
RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
// Workl = A_BLb | |||
LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
// partially redo swaps in A_L | |||
for (i = 0; i < mn1; i++) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
if (ip < *kl) | |||
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); | |||
else | |||
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); | |||
} | |||
} | |||
// apply pivots to A_Rl | |||
LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
// apply pivots to A_Rr columnwise | |||
for (j = 0; j < n22; j++) { | |||
double *const A_Rrj = A_Rr + *ldA * j; | |||
for (i = j; i < mn1; i++) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
const double tmp = A_Rrj[i]; | |||
A_Rrj[i] = A_Rr[ip]; | |||
A_Rrj[ip] = tmp; | |||
} | |||
} | |||
} | |||
// A_TRl = A_TL \ A_TRl | |||
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
// Worku = A_TRr | |||
LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
// Worku = A_TL \ Worku | |||
BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
// A_TRr = Worku | |||
LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
// A_BRtl = A_BRtl - A_BLt * A_TRl | |||
BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
// A_BRbl = A_BRbl - Workl * A_TRl | |||
BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
// A_BRtr = A_BRtr - A_BLt * Worku | |||
BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
// A_BRbr = A_BRbr - Workl * Worku | |||
BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
// partially undo swaps in A_L | |||
for (i = mn1 - 1; i >= 0; i--) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
if (ip < *kl) | |||
BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); | |||
else | |||
BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); | |||
} | |||
} | |||
// recursion(Ab_BR, ipiv_B) | |||
RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
if (*info) | |||
*info += n1; | |||
// shift pivots | |||
for (i = 0; i < mn2; i++) | |||
ipiv_B[i] += n1; | |||
} |
@@ -0,0 +1,165 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, | |||
const int *, const int *, const double *, const double *, const int *, | |||
const double *, const int *, const double *, double *, const int *); | |||
static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, | |||
const int *, const int *, const double *, const double *, const int *, | |||
const double *, const int *, const double *, double *, const int *); | |||
/** DGEMMT computes a matrix-matrix product with general matrices but updates | |||
* only the upper or lower triangular part of the result matrix. | |||
* | |||
* This routine performs the same operation as the BLAS routine | |||
* dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
* but only updates the triangular part of C specified by uplo: | |||
* If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
* otherwise the upper triangular part is updated. | |||
* */ | |||
void RELAPACK_dgemmt( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
#if HAVE_XGEMMT | |||
BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
return; | |||
#else | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
const int notransA = LAPACK(lsame)(transA, "N"); | |||
const int tranA = LAPACK(lsame)(transA, "T"); | |||
const int notransB = LAPACK(lsame)(transB, "N"); | |||
const int tranB = LAPACK(lsame)(transB, "T"); | |||
int info = 0; | |||
if (!lower && !upper) | |||
info = 1; | |||
else if (!tranA && !notransA) | |||
info = 2; | |||
else if (!tranB && !notransB) | |||
info = 3; | |||
else if (*n < 0) | |||
info = 4; | |||
else if (*k < 0) | |||
info = 5; | |||
else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
info = 8; | |||
else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
info = 10; | |||
else if (*ldC < MAX(1, *n)) | |||
info = 13; | |||
if (info) { | |||
LAPACK(xerbla)("DGEMMT", &info); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
const char cleantransA = notransA ? 'N' : 'T'; | |||
const char cleantransB = notransB ? 'N' : 'T'; | |||
// Recursive kernel | |||
RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
#endif | |||
} | |||
/** dgemmt's recursive compute kernel */ | |||
static void RELAPACK_dgemmt_rec( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { | |||
// Unblocked | |||
RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
return; | |||
} | |||
// Splitting | |||
const int n1 = DREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_T | |||
// A_B | |||
const double *const A_T = A; | |||
const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); | |||
// B_L B_R | |||
const double *const B_L = B; | |||
const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); | |||
// C_TL C_TR | |||
// C_BL C_BR | |||
double *const C_TL = C; | |||
double *const C_TR = C + *ldC * n1; | |||
double *const C_BL = C + n1; | |||
double *const C_BR = C + *ldC * n1 + n1; | |||
// recursion(C_TL) | |||
RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
if (*uplo == 'L') | |||
// C_BL = alpha A_B B_L + beta C_BL | |||
BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
else | |||
// C_TR = alpha A_T B_R + beta C_TR | |||
BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
// recursion(C_BR) | |||
RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
} | |||
/** dgemmt's unblocked compute kernel */ | |||
static void RELAPACK_dgemmt_rec2( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
const int incB = (*transB == 'N') ? 1 : *ldB; | |||
const int incC = 1; | |||
int i; | |||
for (i = 0; i < *n; i++) { | |||
// A_0 | |||
// A_i | |||
const double *const A_0 = A; | |||
const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i); | |||
// * B_i * | |||
const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i); | |||
// * C_0i * | |||
// * C_ii * | |||
double *const C_0i = C + *ldC * i; | |||
double *const C_ii = C + *ldC * i + i; | |||
if (*uplo == 'L') { | |||
const int nmi = *n - i; | |||
if (*transA == 'N') | |||
BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
else | |||
BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
} else { | |||
const int ip1 = i + 1; | |||
if (*transA == 'N') | |||
BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
else | |||
BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
} | |||
} | |||
} |
@@ -0,0 +1,117 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_dgetrf_rec(const int *, const int *, double *, | |||
const int *, int *, int *); | |||
/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dgetrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html | |||
* */ | |||
void RELAPACK_dgetrf( | |||
const int *m, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
// Check arguments | |||
*info = 0; | |||
if (*m < 0) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DGETRF", &minfo); | |||
return; | |||
} | |||
const int sn = MIN(*m, *n); | |||
RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
// Right remainder | |||
if (*m < *n) { | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const int iONE[] = { 1. }; | |||
// Splitting | |||
const int rn = *n - *m; | |||
// A_L A_R | |||
const double *const A_L = A; | |||
double *const A_R = A + *ldA * *m; | |||
// A_R = apply(ipiv, A_R) | |||
LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
// A_R = A_S \ A_R | |||
BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
} | |||
} | |||
/** dgetrf's recursive compute kernel */ | |||
static void RELAPACK_dgetrf_rec( | |||
const int *m, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_DGETRF, 1)) { | |||
// Unblocked | |||
LAPACK(dgetf2)(m, n, A, ldA, ipiv, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
// Splitting | |||
const int n1 = DREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
const int m2 = *m - n1; | |||
// A_L A_R | |||
double *const A_L = A; | |||
double *const A_R = A + *ldA * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + *ldA * n1; | |||
double *const A_BL = A + n1; | |||
double *const A_BR = A + *ldA * n1 + n1; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_T = ipiv; | |||
int *const ipiv_B = ipiv + n1; | |||
// recursion(A_L, ipiv_T) | |||
RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
// apply pivots to A_R | |||
LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
// A_TR = A_TL \ A_TR | |||
BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
// A_BR = A_BR - A_BL * A_TR | |||
BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
// recursion(A_BR, ipiv_B) | |||
RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
if (*info) | |||
*info += n1; | |||
// apply pivots to A_BL | |||
LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
ipiv_B[i] += n1; | |||
} |
@@ -0,0 +1,87 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_dlauum_rec(const char *, const int *, double *, | |||
const int *, int *); | |||
/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dlauum. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html | |||
* */ | |||
void RELAPACK_dlauum( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DLAUUM", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Recursive kernel | |||
RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info); | |||
} | |||
/** dlauum's recursive compute kernel */ | |||
static void RELAPACK_dlauum_rec( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { | |||
// Unblocked | |||
LAPACK(dlauu2)(uplo, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
// Splitting | |||
const int n1 = DREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + *ldA * n1; | |||
double *const A_BL = A + n1; | |||
double *const A_BR = A + *ldA * n1 + n1; | |||
// recursion(A_TL) | |||
RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info); | |||
if (*uplo == 'L') { | |||
// A_TL = A_TL + A_BL' * A_BL | |||
BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
// A_BL = A_BR' * A_BL | |||
BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
} else { | |||
// A_TL = A_TL + A_TR * A_TR' | |||
BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
// A_TR = A_TR * A_BR' | |||
BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info); | |||
} |
@@ -0,0 +1,157 @@ | |||
#include "relapack.h" | |||
#include "stdlib.h" | |||
static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, | |||
double *, const int *, double *, const int *, int *); | |||
/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dpbtrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html | |||
* */ | |||
void RELAPACK_dpbtrf( | |||
const char *uplo, const int *n, const int *kd, | |||
double *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*kd < 0) | |||
*info = -3; | |||
else if (*ldAb < *kd + 1) | |||
*info = -5; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DPBTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Constant | |||
const double ZERO[] = { 0. }; | |||
// Allocate work space | |||
const int n1 = DREC_SPLIT(*n); | |||
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
double *Work = malloc(mWork * nWork * sizeof(double)); | |||
LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
// Recursive kernel | |||
RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
// Free work space | |||
free(Work); | |||
} | |||
/** dpbtrf's recursive compute kernel */ | |||
static void RELAPACK_dpbtrf_rec( | |||
const char *uplo, const int *n, const int *kd, | |||
double *Ab, const int *ldAb, | |||
double *Work, const int *ldWork, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { | |||
// Unblocked | |||
LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
double *const A = Ab + ((*uplo == 'L') ? 0 : *kd); | |||
// Splitting | |||
const int n1 = MIN(DREC_SPLIT(*n), *kd); | |||
const int n2 = *n - n1; | |||
// * * | |||
// * Ab_BR | |||
double *const Ab_BR = Ab + *ldAb * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + *ldA * n1; | |||
double *const A_BL = A + n1; | |||
double *const A_BR = A + *ldA * n1 + n1; | |||
// recursion(A_TL) | |||
RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
// Banded splitting | |||
const int n21 = MIN(n2, *kd - n1); | |||
const int n22 = MIN(n2 - n21, n1); | |||
// n1 n21 n22 | |||
// n1 * A_TRl A_TRr | |||
// n21 A_BLt A_BRtl A_BRtr | |||
// n22 A_BLb A_BRbl A_BRbr | |||
double *const A_TRl = A_TR; | |||
double *const A_TRr = A_TR + *ldA * n21; | |||
double *const A_BLt = A_BL; | |||
double *const A_BLb = A_BL + n21; | |||
double *const A_BRtl = A_BR; | |||
double *const A_BRtr = A_BR + *ldA * n21; | |||
double *const A_BRbl = A_BR + n21; | |||
double *const A_BRbr = A_BR + *ldA * n21 + n21; | |||
if (*uplo == 'L') { | |||
// A_BLt = ABLt / A_TL' | |||
BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); | |||
// A_BRtl = A_BRtl - A_BLt * A_BLt' | |||
BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); | |||
// Work = A_BLb | |||
LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); | |||
// Work = Work / A_TL' | |||
BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); | |||
// A_BRbl = A_BRbl - Work * A_BLt' | |||
BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); | |||
// A_BRbr = A_BRbr - Work * Work' | |||
BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
// A_BLb = Work | |||
LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); | |||
} else { | |||
// A_TRl = A_TL' \ A_TRl | |||
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
// A_BRtl = A_BRtl - A_TRl' * A_TRl | |||
BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
// Work = A_TRr | |||
LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); | |||
// Work = A_TL' \ Work | |||
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); | |||
// A_BRtr = A_BRtr - A_TRl' * Work | |||
BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); | |||
// A_BRbr = A_BRbr - Work' * Work | |||
BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
// A_TRr = Work | |||
LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); | |||
} | |||
// recursion(A_BR) | |||
if (*kd > n1) | |||
RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info); | |||
else | |||
RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,92 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_dpotrf_rec(const char *, const int *, double *, | |||
const int *, int *); | |||
/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dpotrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html | |||
* */ | |||
void RELAPACK_dpotrf( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DPOTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Recursive kernel | |||
RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
} | |||
/** dpotrf's recursive compute kernel */ | |||
static void RELAPACK_dpotrf_rec( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { | |||
// Unblocked | |||
LAPACK(dpotf2)(uplo, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
// Splitting | |||
const int n1 = DREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + *ldA * n1; | |||
double *const A_BL = A + n1; | |||
double *const A_BR = A + *ldA * n1 + n1; | |||
// recursion(A_TL) | |||
RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL / A_TL' | |||
BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
// A_BR = A_BR - A_BL * A_BL' | |||
BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
} else { | |||
// A_TR = A_TL' \ A_TR | |||
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
// A_BR = A_BR - A_TR' * A_TR | |||
BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,212 @@ | |||
#include "relapack.h" | |||
#if XSYGST_ALLOW_MALLOC | |||
#include "stdlib.h" | |||
#endif | |||
static void RELAPACK_dsygst_rec(const int *, const char *, const int *, | |||
double *, const int *, const double *, const int *, | |||
double *, const int *, int *); | |||
/** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dsygst. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html | |||
* */ | |||
void RELAPACK_dsygst( | |||
const int *itype, const char *uplo, const int *n, | |||
double *A, const int *ldA, const double *B, const int *ldB, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (*itype < 1 || *itype > 3) | |||
*info = -1; | |||
else if (!lower && !upper) | |||
*info = -2; | |||
else if (*n < 0) | |||
*info = -3; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -5; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -7; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DSYGST", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Allocate work space | |||
double *Work = NULL; | |||
int lWork = 0; | |||
#if XSYGST_ALLOW_MALLOC | |||
const int n1 = DREC_SPLIT(*n); | |||
lWork = n1 * (*n - n1); | |||
Work = malloc(lWork * sizeof(double)); | |||
if (!Work) | |||
lWork = 0; | |||
#endif | |||
// recursive kernel | |||
RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
// Free work space | |||
#if XSYGST_ALLOW_MALLOC | |||
if (Work) | |||
free(Work); | |||
#endif | |||
} | |||
/** dsygst's recursive compute kernel */ | |||
static void RELAPACK_dsygst_rec( | |||
const int *itype, const char *uplo, const int *n, | |||
double *A, const int *ldA, const double *B, const int *ldB, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_SSYGST, 1)) { | |||
// Unblocked | |||
LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info); | |||
return; | |||
} | |||
// Constants | |||
const double ZERO[] = { 0. }; | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
const double HALF[] = { .5 }; | |||
const double MHALF[] = { -.5 }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterator | |||
int i; | |||
// Splitting | |||
const int n1 = DREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + *ldA * n1; | |||
double *const A_BL = A + n1; | |||
double *const A_BR = A + *ldA * n1 + n1; | |||
// B_TL B_TR | |||
// B_BL B_BR | |||
const double *const B_TL = B; | |||
const double *const B_TR = B + *ldB * n1; | |||
const double *const B_BL = B + n1; | |||
const double *const B_BR = B + *ldB * n1 + n1; | |||
// recursion(A_TL, B_TL) | |||
RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); | |||
if (*itype == 1) | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL / B_TL' | |||
BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = -1/2 * B_BL * A_TL | |||
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
} else | |||
// A_BL = A_BL - 1/2 B_BL * A_TL | |||
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' | |||
BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
else | |||
// A_BL = A_BL - 1/2 B_BL * A_TL | |||
BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BL = B_BR \ A_BL | |||
BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
} else { | |||
// A_TR = B_TL' \ A_TR | |||
BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = -1/2 * A_TL * B_TR | |||
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
// A_TR = A_BL + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
} else | |||
// A_TR = A_TR - 1/2 A_TL * B_TR | |||
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR | |||
BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_TR = A_BL + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
else | |||
// A_TR = A_TR - 1/2 A_TL * B_TR | |||
BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TR = A_TR / B_BR | |||
BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
} | |||
else | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL * B_TL | |||
BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = 1/2 * A_BR * B_BL | |||
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
} else | |||
// A_BL = A_BL + 1/2 A_BR * B_BL | |||
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL | |||
BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
else | |||
// A_BL = A_BL + 1/2 A_BR * B_BL | |||
BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BL = B_BR * A_BL | |||
BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
} else { | |||
// A_TR = B_TL * A_TR | |||
BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = 1/2 * B_TR * A_BR | |||
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
// A_TR = A_TR + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
} else | |||
// A_TR = A_TR + 1/2 B_TR A_BR | |||
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' | |||
BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_TR = A_TR + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
else | |||
// A_TR = A_TR + 1/2 B_TR * A_BR | |||
BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TR = A_TR * B_BR | |||
BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
} | |||
// recursion(A_BR, B_BR) | |||
RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); | |||
} |
@@ -0,0 +1,238 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *, | |||
double *, const int *, int *, double *, const int *, int *); | |||
/** DSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dsytrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html | |||
* */ | |||
void RELAPACK_dsytrf( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
double *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * sizeof(double)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DSYTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy arguments | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** dsytrf's recursive compute kernel */ | |||
static void RELAPACK_dsytrf_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_DSYTRF, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterator | |||
int i; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = DREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
double *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
double *const A_BL = A + n1; | |||
double *const A_BR = A + *ldA * n1 + n1; | |||
double *const A_BL_B = A + *n; | |||
double *const A_BR_B = A + *ldA * n1 + *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
double *const Work_BL = Work + n1; | |||
double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; | |||
// last row of A_BL | |||
double *const A_BL_b = A_BL + n2_out; | |||
// last row of Work_BL | |||
double *const Work_BL_b = Work_BL + n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = DREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
double *const Work_R = top ? Work : Work + *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
double *const A_TL_T = A + *ldA * n_rest; | |||
double *const A_TR_T = A + *ldA * (n_rest + n1); | |||
double *const A_TL = A + *ldA * n_rest + n_rest; | |||
double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
double *const Work_L = Work; | |||
double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,352 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static int c__1 = 1; | |||
static double c_b8 = -1.; | |||
static double c_b9 = 1.; | |||
/** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. | |||
* | |||
* This routine is a minor modification of LAPACK's dlasyf. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int * | |||
nb, int *kb, double *a, int *lda, int *ipiv, | |||
double *w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; | |||
double d__1, d__2, d__3; | |||
/* Builtin functions */ | |||
double sqrt(double); | |||
/* Local variables */ | |||
static int j, k; | |||
static double t, r1, d11, d21, d22; | |||
static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
static double alpha; | |||
extern /* Subroutine */ int dscal_(int *, double *, double *, | |||
int *); | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int dgemv_(char *, int *, int *, | |||
double *, double *, int *, double *, int *, | |||
double *, double *, int *, ftnlen), dcopy_(int *, | |||
double *, int *, double *, int *), dswap_(int | |||
*, double *, int *, double *, int *); | |||
static int kstep; | |||
static double absakk; | |||
extern int idamax_(int *, double *, int *); | |||
static double colmax, rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.) + 1.) / 8.; | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
} | |||
kstep = 1; | |||
absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = k - imax; | |||
dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
i__1 = k - imax; | |||
jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
&c__1); | |||
rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
/* Computing MAX */ | |||
d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1], | |||
abs(d__1)); | |||
rowmax = max(d__2,d__3); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >= | |||
alpha * rowmax) { | |||
kp = imax; | |||
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kp != kk) { | |||
a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; | |||
i__1 = kk - 1 - kp; | |||
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
if (kp > 1) { | |||
i__1 = kp - 1; | |||
dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
+ 1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
+ 1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
r1 = 1. / a[k + k * a_dim1]; | |||
i__1 = k - 1; | |||
dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
if (k > 2) { | |||
d21 = w[k - 1 + kw * w_dim1]; | |||
d11 = w[k + kw * w_dim1] / d21; | |||
d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; | |||
t = 1. / (d11 * d22 - 1.); | |||
d21 = t / d21; | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) | |||
* w_dim1] - w[j + kw * w_dim1]); | |||
a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - | |||
w[j + (kw - 1) * w_dim1]); | |||
/* L20: */ | |||
} | |||
} | |||
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; | |||
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; | |||
a[k + k * a_dim1] = w[k + kw * w_dim1]; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
++j; | |||
} | |||
++j; | |||
if (jp != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
} | |||
if (j < *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
i__1 = *n - k + 1; | |||
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k | |||
+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
kstep = 1; | |||
absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
i__1 = imax - k; | |||
dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = *n - imax + 1; | |||
dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
1) * w_dim1], &c__1); | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], | |||
lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * | |||
w_dim1], &c__1, (ftnlen)12); | |||
i__1 = imax - k; | |||
jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
; | |||
rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
/* Computing MAX */ | |||
d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1], | |||
abs(d__1)); | |||
rowmax = max(d__2,d__3); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >= | |||
alpha * rowmax) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kp != kk) { | |||
a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; | |||
i__1 = kp - kk - 1; | |||
dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
1) * a_dim1], lda); | |||
if (kp < *n) { | |||
i__1 = *n - kp; | |||
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
+ kp * a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
} | |||
dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
r1 = 1. / a[k + k * a_dim1]; | |||
i__1 = *n - k; | |||
dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
d21 = w[k + 1 + k * w_dim1]; | |||
d11 = w[k + 1 + (k + 1) * w_dim1] / d21; | |||
d22 = w[k + k * w_dim1] / d21; | |||
t = 1. / (d11 * d22 - 1.); | |||
d21 = t / d21; | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - | |||
w[j + (k + 1) * w_dim1]); | |||
a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * | |||
w_dim1] - w[j + k * w_dim1]); | |||
/* L80: */ | |||
} | |||
} | |||
a[k + k * a_dim1] = w[k + k * w_dim1]; | |||
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; | |||
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
--j; | |||
} | |||
--j; | |||
if (jp != jj && j >= 1) { | |||
dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j > 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,236 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *, | |||
double *, const int *, int *, double *, const int *, int *); | |||
/** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dsytrf_rook. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html | |||
* */ | |||
void RELAPACK_dsytrf_rook( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
double *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * sizeof(double)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DSYTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy argument | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** dsytrf_rook's recursive compute kernel */ | |||
static void RELAPACK_dsytrf_rook_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = DREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
double *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
double *const A_BL = A + n1; | |||
double *const A_BR = A + *ldA * n1 + n1; | |||
double *const A_BL_B = A + *n; | |||
double *const A_BR_B = A + *ldA * n1 + *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
double *const Work_BL = Work + n1; | |||
double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; | |||
// last row of A_BL | |||
double *const A_BL_b = A_BL + n2_out; | |||
// last row of Work_BL | |||
double *const Work_BL_b = Work_BL + n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = DREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
double *const Work_R = top ? Work : Work + *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
double *const A_TL_T = A + *ldA * n_rest; | |||
double *const A_TR_T = A + *ldA * (n_rest + n1); | |||
double *const A_TL = A + *ldA * n_rest + n_rest; | |||
double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
double *const Work_L = Work; | |||
double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,451 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static int c__1 = 1; | |||
static double c_b9 = -1.; | |||
static double c_b10 = 1.; | |||
/** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. | |||
* | |||
* This routine is a minor modification of LAPACK's dlasyf. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n, | |||
int *nb, int *kb, double *a, int *lda, int *ipiv, | |||
double *w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; | |||
double d__1; | |||
/* Builtin functions */ | |||
double sqrt(double); | |||
/* Local variables */ | |||
static int j, k, p; | |||
static double t, r1, d11, d12, d21, d22; | |||
static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
static logical done; | |||
static int imax, jmax; | |||
static double alpha; | |||
extern /* Subroutine */ int dscal_(int *, double *, double *, | |||
int *); | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int dgemv_(char *, int *, int *, | |||
double *, double *, int *, double *, int *, | |||
double *, double *, int *, ftnlen); | |||
static double dtemp, sfmin; | |||
static int itemp; | |||
extern /* Subroutine */ int dcopy_(int *, double *, int *, | |||
double *, int *), dswap_(int *, double *, int | |||
*, double *, int *); | |||
static int kstep; | |||
extern double dlamch_(char *, ftnlen); | |||
static double absakk; | |||
extern int idamax_(int *, double *, int *); | |||
static double colmax, rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.) + 1.) / 8.; | |||
sfmin = dlamch_("S", (ftnlen)1); | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
kstep = 1; | |||
p = k; | |||
dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
} | |||
absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L12: | |||
dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = k - imax; | |||
dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
if (imax != k) { | |||
i__1 = k - imax; | |||
jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
w_dim1], &c__1); | |||
rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); | |||
} else { | |||
rowmax = 0.; | |||
} | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1)); | |||
if (dtemp > rowmax) { | |||
rowmax = dtemp; | |||
jmax = itemp; | |||
} | |||
} | |||
if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) < | |||
alpha * rowmax)) { | |||
kp = imax; | |||
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} | |||
if (! done) { | |||
goto L12; | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kstep == 2 && p != k) { | |||
i__1 = k - p; | |||
dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
a_dim1], lda); | |||
dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
c__1); | |||
i__1 = *n - k + 1; | |||
dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
lda); | |||
i__1 = *n - kk + 1; | |||
dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
ldw); | |||
} | |||
if (kp != kk) { | |||
a[kp + k * a_dim1] = a[kk + k * a_dim1]; | |||
i__1 = k - 1 - kp; | |||
dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
c__1); | |||
i__1 = *n - kk + 1; | |||
dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
lda); | |||
i__1 = *n - kk + 1; | |||
dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
if (k > 1) { | |||
if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) { | |||
r1 = 1. / a[k + k * a_dim1]; | |||
i__1 = k - 1; | |||
dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else if (a[k + k * a_dim1] != 0.) { | |||
i__1 = k - 1; | |||
for (ii = 1; ii <= i__1; ++ii) { | |||
a[ii + k * a_dim1] /= a[k + k * a_dim1]; | |||
/* L14: */ | |||
} | |||
} | |||
} | |||
} else { | |||
if (k > 2) { | |||
d12 = w[k - 1 + kw * w_dim1]; | |||
d11 = w[k + kw * w_dim1] / d12; | |||
d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; | |||
t = 1. / (d11 * d22 - 1.); | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * | |||
w_dim1] - w[j + kw * w_dim1]) / d12); | |||
a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - | |||
w[j + (kw - 1) * w_dim1]) / d12); | |||
/* L20: */ | |||
} | |||
} | |||
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; | |||
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; | |||
a[k + k * a_dim1] = w[k + kw * w_dim1]; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
++j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
++j; | |||
if (jp2 != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
jj = j - 1; | |||
if (jp1 != jj && kstep == 2) { | |||
i__1 = *n - j + 1; | |||
dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
if (j <= *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
kstep = 1; | |||
p = k; | |||
i__1 = *n - k + 1; | |||
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & | |||
w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = *n - k + 1; | |||
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L72: | |||
i__1 = imax - k; | |||
dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = *n - imax + 1; | |||
dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
1) * w_dim1], &c__1); | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] | |||
, lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k | |||
+ 1) * w_dim1], &c__1, (ftnlen)12); | |||
} | |||
if (imax != k) { | |||
i__1 = imax - k; | |||
jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
c__1); | |||
rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); | |||
} else { | |||
rowmax = 0.; | |||
} | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1)); | |||
if (dtemp > rowmax) { | |||
rowmax = dtemp; | |||
jmax = itemp; | |||
} | |||
} | |||
if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha | |||
* rowmax)) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
i__1 = *n - k + 1; | |||
dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
} | |||
if (! done) { | |||
goto L72; | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kstep == 2 && p != k) { | |||
i__1 = p - k; | |||
dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
lda); | |||
i__1 = *n - p + 1; | |||
dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
c__1); | |||
dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
} | |||
if (kp != kk) { | |||
a[kp + k * a_dim1] = a[kk + k * a_dim1]; | |||
i__1 = kp - k - 1; | |||
dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
* a_dim1], lda); | |||
i__1 = *n - kp + 1; | |||
dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
a_dim1], &c__1); | |||
dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) { | |||
r1 = 1. / a[k + k * a_dim1]; | |||
i__1 = *n - k; | |||
dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} else if (a[k + k * a_dim1] != 0.) { | |||
i__1 = *n; | |||
for (ii = k + 1; ii <= i__1; ++ii) { | |||
a[ii + k * a_dim1] /= a[k + k * a_dim1]; | |||
/* L74: */ | |||
} | |||
} | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
d21 = w[k + 1 + k * w_dim1]; | |||
d11 = w[k + 1 + (k + 1) * w_dim1] / d21; | |||
d22 = w[k + k * w_dim1] / d21; | |||
t = 1. / (d11 * d22 - 1.); | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[ | |||
j + (k + 1) * w_dim1]) / d21); | |||
a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) * | |||
w_dim1] - w[j + k * w_dim1]) / d21); | |||
/* L80: */ | |||
} | |||
} | |||
a[k + k * a_dim1] = w[k + k * w_dim1]; | |||
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; | |||
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
--j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
--j; | |||
if (jp2 != jj && j >= 1) { | |||
dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
jj = j + 1; | |||
if (jp1 != jj && kstep == 2) { | |||
dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j >= 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,274 @@ | |||
#include "relapack.h" | |||
#include <math.h> | |||
static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *, | |||
const int *, const double *, const int *, const double *, const int *, | |||
double *, const int *, const double *, const int *, const double *, | |||
const int *, double *, const int *, double *, double *, double *, int *, | |||
int *, int *); | |||
/** DTGSYL solves the generalized Sylvester equation. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dtgsyl. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html | |||
* */ | |||
void RELAPACK_dtgsyl( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, | |||
const double *D, const int *ldD, const double *E, const int *ldE, | |||
double *F, const int *ldF, | |||
double *scale, double *dif, | |||
double *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
// Parse arguments | |||
const int notran = LAPACK(lsame)(trans, "N"); | |||
const int tran = LAPACK(lsame)(trans, "T"); | |||
// Compute work buffer size | |||
int lwmin = 1; | |||
if (notran && (*ijob == 1 || *ijob == 2)) | |||
lwmin = MAX(1, 2 * *m * *n); | |||
*info = 0; | |||
// Check arguments | |||
if (!tran && !notran) | |||
*info = -1; | |||
else if (notran && (*ijob < 0 || *ijob > 4)) | |||
*info = -2; | |||
else if (*m <= 0) | |||
*info = -3; | |||
else if (*n <= 0) | |||
*info = -4; | |||
else if (*ldA < MAX(1, *m)) | |||
*info = -6; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -8; | |||
else if (*ldC < MAX(1, *m)) | |||
*info = -10; | |||
else if (*ldD < MAX(1, *m)) | |||
*info = -12; | |||
else if (*ldE < MAX(1, *n)) | |||
*info = -14; | |||
else if (*ldF < MAX(1, *m)) | |||
*info = -16; | |||
else if (*lWork < lwmin && *lWork != -1) | |||
*info = -20; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DTGSYL", &minfo); | |||
return; | |||
} | |||
if (*lWork == -1) { | |||
// Work size query | |||
*Work = lwmin; | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleantrans = notran ? 'N' : 'T'; | |||
// Constant | |||
const double ZERO[] = { 0. }; | |||
int isolve = 1; | |||
int ifunc = 0; | |||
if (notran) { | |||
if (*ijob >= 3) { | |||
ifunc = *ijob - 2; | |||
LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
} else if (*ijob >= 1) | |||
isolve = 2; | |||
} | |||
double scale2; | |||
int iround; | |||
for (iround = 1; iround <= isolve; iround++) { | |||
*scale = 1; | |||
double dscale = 0; | |||
double dsum = 1; | |||
int pq; | |||
RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); | |||
if (dscale != 0) { | |||
if (*ijob == 1 || *ijob == 3) | |||
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); | |||
else | |||
*dif = sqrt(pq) / (dscale * sqrt(dsum)); | |||
} | |||
if (isolve == 2) { | |||
if (iround == 1) { | |||
if (notran) | |||
ifunc = *ijob; | |||
scale2 = *scale; | |||
LAPACK(dlacpy)("F", m, n, C, ldC, Work, m); | |||
LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m); | |||
LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
} else { | |||
LAPACK(dlacpy)("F", m, n, Work, m, C, ldC); | |||
LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF); | |||
*scale = scale2; | |||
} | |||
} | |||
} | |||
} | |||
/** dtgsyl's recursive vompute kernel */ | |||
static void RELAPACK_dtgsyl_rec( | |||
const char *trans, const int *ifunc, const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, | |||
const double *D, const int *ldD, const double *E, const int *ldE, | |||
double *F, const int *ldF, | |||
double *scale, double *dsum, double *dscale, | |||
int *iWork, int *pq, int *info | |||
) { | |||
if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) { | |||
// Unblocked | |||
LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
// Outputs | |||
double scale1[] = { 1. }; | |||
double scale2[] = { 1. }; | |||
int info1[] = { 0 }; | |||
int info2[] = { 0 }; | |||
if (*m > *n) { | |||
// Splitting | |||
int m1 = DREC_SPLIT(*m); | |||
if (A[m1 + *ldA * (m1 - 1)]) | |||
m1++; | |||
const int m2 = *m - m1; | |||
// A_TL A_TR | |||
// 0 A_BR | |||
const double *const A_TL = A; | |||
const double *const A_TR = A + *ldA * m1; | |||
const double *const A_BR = A + *ldA * m1 + m1; | |||
// C_T | |||
// C_B | |||
double *const C_T = C; | |||
double *const C_B = C + m1; | |||
// D_TL D_TR | |||
// 0 D_BR | |||
const double *const D_TL = D; | |||
const double *const D_TR = D + *ldD * m1; | |||
const double *const D_BR = D + *ldD * m1 + m1; | |||
// F_T | |||
// F_B | |||
double *const F_T = F; | |||
double *const F_B = F + m1; | |||
if (*trans == 'N') { | |||
// recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
// C_T = C_T - A_TR * C_B | |||
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
// F_T = F_T - D_TR * C_B | |||
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
// recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
} | |||
} else { | |||
// recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
// apply scale | |||
if (scale1[0] != 1) | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
// C_B = C_B - A_TR^H * C_T | |||
BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
// C_B = C_B - D_TR^H * F_T | |||
BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
// recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
} | |||
} | |||
} else { | |||
// Splitting | |||
int n1 = DREC_SPLIT(*n); | |||
if (B[n1 + *ldB * (n1 - 1)]) | |||
n1++; | |||
const int n2 = *n - n1; | |||
// B_TL B_TR | |||
// 0 B_BR | |||
const double *const B_TL = B; | |||
const double *const B_TR = B + *ldB * n1; | |||
const double *const B_BR = B + *ldB * n1 + n1; | |||
// C_L C_R | |||
double *const C_L = C; | |||
double *const C_R = C + *ldC * n1; | |||
// E_TL E_TR | |||
// 0 E_BR | |||
const double *const E_TL = E; | |||
const double *const E_TR = E + *ldE * n1; | |||
const double *const E_BR = E + *ldE * n1 + n1; | |||
// F_L F_R | |||
double *const F_L = F; | |||
double *const F_R = F + *ldF * n1; | |||
if (*trans == 'N') { | |||
// recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
// C_R = C_R + F_L * B_TR | |||
BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
// F_R = F_R + F_L * E_TR | |||
BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
// recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
} | |||
} else { | |||
// recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
// apply scale | |||
if (scale1[0] != 1) | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
// F_L = F_L + C_R * B_TR | |||
BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
// F_L = F_L + F_R * E_TR | |||
BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
// recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
} | |||
} | |||
} | |||
*scale = scale1[0] * scale2[0]; | |||
*info = info1[0] || info2[0]; | |||
} |
@@ -0,0 +1,169 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *, | |||
const int *, const int *, const double *, const int *, const double *, | |||
const int *, double *, const int *, double *, int *); | |||
/** DTRSYL solves the real Sylvester matrix equation. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dtrsyl. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html | |||
* */ | |||
void RELAPACK_dtrsyl( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, double *scale, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int notransA = LAPACK(lsame)(tranA, "N"); | |||
const int transA = LAPACK(lsame)(tranA, "T"); | |||
const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
const int notransB = LAPACK(lsame)(tranB, "N"); | |||
const int transB = LAPACK(lsame)(tranB, "T"); | |||
const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
*info = 0; | |||
if (!transA && !ctransA && !notransA) | |||
*info = -1; | |||
else if (!transB && !ctransB && !notransB) | |||
*info = -2; | |||
else if (*isgn != 1 && *isgn != -1) | |||
*info = -3; | |||
else if (*m < 0) | |||
*info = -4; | |||
else if (*n < 0) | |||
*info = -5; | |||
else if (*ldA < MAX(1, *m)) | |||
*info = -7; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -9; | |||
else if (*ldC < MAX(1, *m)) | |||
*info = -11; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DTRSYL", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); | |||
const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C'); | |||
// Recursive kernel | |||
RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
/** dtrsyl's recursive compute kernel */ | |||
static void RELAPACK_dtrsyl_rec( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, double *scale, | |||
int *info | |||
) { | |||
if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) { | |||
// Unblocked | |||
RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
const double MSGN[] = { -*isgn }; | |||
const int iONE[] = { 1 }; | |||
// Outputs | |||
double scale1[] = { 1. }; | |||
double scale2[] = { 1. }; | |||
int info1[] = { 0 }; | |||
int info2[] = { 0 }; | |||
if (*m > *n) { | |||
// Splitting | |||
int m1 = DREC_SPLIT(*m); | |||
if (A[m1 + *ldA * (m1 - 1)]) | |||
m1++; | |||
const int m2 = *m - m1; | |||
// A_TL A_TR | |||
// 0 A_BR | |||
const double *const A_TL = A; | |||
const double *const A_TR = A + *ldA * m1; | |||
const double *const A_BR = A + *ldA * m1 + m1; | |||
// C_T | |||
// C_B | |||
double *const C_T = C; | |||
double *const C_B = C + m1; | |||
if (*tranA == 'N') { | |||
// recusion(A_BR, B, C_B) | |||
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
// C_T = C_T - A_TR * C_B | |||
BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
// recusion(A_TL, B, C_T) | |||
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
} else { | |||
// recusion(A_TL, B, C_T) | |||
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
// C_B = C_B - A_TR' * C_T | |||
BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
// recusion(A_BR, B, C_B) | |||
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
} | |||
} else { | |||
// Splitting | |||
int n1 = DREC_SPLIT(*n); | |||
if (B[n1 + *ldB * (n1 - 1)]) | |||
n1++; | |||
const int n2 = *n - n1; | |||
// B_TL B_TR | |||
// 0 B_BR | |||
const double *const B_TL = B; | |||
const double *const B_TR = B + *ldB * n1; | |||
const double *const B_BR = B + *ldB * n1 + n1; | |||
// C_L C_R | |||
double *const C_L = C; | |||
double *const C_R = C + *ldC * n1; | |||
if (*tranB == 'N') { | |||
// recusion(A, B_TL, C_L) | |||
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
// C_R = C_R -/+ C_L * B_TR | |||
BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
// recusion(A, B_BR, C_R) | |||
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
} else { | |||
// recusion(A, B_BR, C_R) | |||
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
// C_L = C_L -/+ C_R * B_TR' | |||
BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
// recusion(A, B_TL, C_L) | |||
RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
} | |||
} | |||
*scale = scale1[0] * scale2[0]; | |||
*info = info1[0] || info2[0]; | |||
} |
@@ -0,0 +1,107 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, | |||
double *, const int *, int *); | |||
/** DTRTRI computes the inverse of a real upper or lower triangular matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's dtrtri. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html | |||
* */ | |||
void RELAPACK_dtrtri( | |||
const char *uplo, const char *diag, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
const int nounit = LAPACK(lsame)(diag, "N"); | |||
const int unit = LAPACK(lsame)(diag, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (!nounit && !unit) | |||
*info = -2; | |||
else if (*n < 0) | |||
*info = -3; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -5; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("DTRTRI", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
const char cleandiag = nounit ? 'N' : 'U'; | |||
// check for singularity | |||
if (nounit) { | |||
int i; | |||
for (i = 0; i < *n; i++) | |||
if (A[i + *ldA * i] == 0) { | |||
*info = i; | |||
return; | |||
} | |||
} | |||
// Recursive kernel | |||
RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
} | |||
/** dtrtri's recursive compute kernel */ | |||
static void RELAPACK_dtrtri_rec( | |||
const char *uplo, const char *diag, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_DTRTRI, 1)) { | |||
// Unblocked | |||
LAPACK(dtrti2)(uplo, diag, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
// Splitting | |||
const int n1 = DREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + *ldA * n1; | |||
double *const A_BL = A + n1; | |||
double *const A_BR = A + *ldA * n1 + n1; | |||
// recursion(A_TL) | |||
RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
if (*uplo == 'L') { | |||
// A_BL = - A_BL * A_TL | |||
BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
// A_BL = A_BR \ A_BL | |||
BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
} else { | |||
// A_TR = - A_TL * A_TR | |||
BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
// A_TR = A_TR / A_BR | |||
BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,109 @@ | |||
#include "stdlib.h" | |||
#include "stdio.h" | |||
#include "signal.h" | |||
#include "f2c.h" | |||
#ifndef SIGIOT | |||
#ifdef SIGABRT | |||
#define SIGIOT SIGABRT | |||
#endif | |||
#endif | |||
void sig_die(const char *s, int kill) { | |||
/* print error message, then clear buffers */ | |||
fprintf(stderr, "%s\n", s); | |||
if(kill) { | |||
fflush(stderr); | |||
/* now get a core */ | |||
signal(SIGIOT, SIG_DFL); | |||
abort(); | |||
} else | |||
exit(1); | |||
} | |||
void c_div(complex *c, complex *a, complex *b) { | |||
double ratio, den; | |||
double abr, abi, cr; | |||
if( (abr = b->r) < 0.) | |||
abr = - abr; | |||
if( (abi = b->i) < 0.) | |||
abi = - abi; | |||
if( abr <= abi ) { | |||
if(abi == 0) { | |||
#ifdef IEEE_COMPLEX_DIVIDE | |||
float af, bf; | |||
af = bf = abr; | |||
if (a->i != 0 || a->r != 0) | |||
af = 1.; | |||
c->i = c->r = af / bf; | |||
return; | |||
#else | |||
sig_die("complex division by zero", 1); | |||
#endif | |||
} | |||
ratio = (double)b->r / b->i ; | |||
den = b->i * (1 + ratio*ratio); | |||
cr = (a->r*ratio + a->i) / den; | |||
c->i = (a->i*ratio - a->r) / den; | |||
} else { | |||
ratio = (double)b->i / b->r ; | |||
den = b->r * (1 + ratio*ratio); | |||
cr = (a->r + a->i*ratio) / den; | |||
c->i = (a->i - a->r*ratio) / den; | |||
} | |||
c->r = cr; | |||
} | |||
void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) { | |||
double ratio, den; | |||
double abr, abi, cr; | |||
if( (abr = b->r) < 0.) | |||
abr = - abr; | |||
if( (abi = b->i) < 0.) | |||
abi = - abi; | |||
if( abr <= abi ) { | |||
if(abi == 0) { | |||
#ifdef IEEE_COMPLEX_DIVIDE | |||
if (a->i != 0 || a->r != 0) | |||
abi = 1.; | |||
c->i = c->r = abi / abr; | |||
return; | |||
#else | |||
sig_die("complex division by zero", 1); | |||
#endif | |||
} | |||
ratio = b->r / b->i ; | |||
den = b->i * (1 + ratio*ratio); | |||
cr = (a->r*ratio + a->i) / den; | |||
c->i = (a->i*ratio - a->r) / den; | |||
} else { | |||
ratio = b->i / b->r ; | |||
den = b->r * (1 + ratio*ratio); | |||
cr = (a->r + a->i*ratio) / den; | |||
c->i = (a->i - a->r*ratio) / den; | |||
} | |||
c->r = cr; | |||
} | |||
float r_imag(complex *z) { | |||
return z->i; | |||
} | |||
void r_cnjg(complex *r, complex *z) { | |||
float zi = z->i; | |||
r->r = z->r; | |||
r->i = -zi; | |||
} | |||
double d_imag(doublecomplex *z) { | |||
return z->i; | |||
} | |||
void d_cnjg(doublecomplex *r, doublecomplex *z) { | |||
double zi = z->i; | |||
r->r = z->r; | |||
r->i = -zi; | |||
} |
@@ -0,0 +1,223 @@ | |||
/* f2c.h -- Standard Fortran to C header file */ | |||
/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." | |||
- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ | |||
#ifndef F2C_INCLUDE | |||
#define F2C_INCLUDE | |||
typedef long int integer; | |||
typedef unsigned long int uinteger; | |||
typedef char *address; | |||
typedef short int shortint; | |||
typedef float real; | |||
typedef double doublereal; | |||
typedef struct { real r, i; } complex; | |||
typedef struct { doublereal r, i; } doublecomplex; | |||
typedef long int logical; | |||
typedef short int shortlogical; | |||
typedef char logical1; | |||
typedef char integer1; | |||
#ifdef INTEGER_STAR_8 /* Adjust for integer*8. */ | |||
typedef long long longint; /* system-dependent */ | |||
typedef unsigned long long ulongint; /* system-dependent */ | |||
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) | |||
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) | |||
#endif | |||
#define TRUE_ (1) | |||
#define FALSE_ (0) | |||
/* Extern is for use with -E */ | |||
#ifndef Extern | |||
#define Extern extern | |||
#endif | |||
/* I/O stuff */ | |||
#ifdef f2c_i2 | |||
/* for -i2 */ | |||
typedef short flag; | |||
typedef short ftnlen; | |||
typedef short ftnint; | |||
#else | |||
typedef long int flag; | |||
typedef long int ftnlen; | |||
typedef long int ftnint; | |||
#endif | |||
/*external read, write*/ | |||
typedef struct | |||
{ flag cierr; | |||
ftnint ciunit; | |||
flag ciend; | |||
char *cifmt; | |||
ftnint cirec; | |||
} cilist; | |||
/*internal read, write*/ | |||
typedef struct | |||
{ flag icierr; | |||
char *iciunit; | |||
flag iciend; | |||
char *icifmt; | |||
ftnint icirlen; | |||
ftnint icirnum; | |||
} icilist; | |||
/*open*/ | |||
typedef struct | |||
{ flag oerr; | |||
ftnint ounit; | |||
char *ofnm; | |||
ftnlen ofnmlen; | |||
char *osta; | |||
char *oacc; | |||
char *ofm; | |||
ftnint orl; | |||
char *oblnk; | |||
} olist; | |||
/*close*/ | |||
typedef struct | |||
{ flag cerr; | |||
ftnint cunit; | |||
char *csta; | |||
} cllist; | |||
/*rewind, backspace, endfile*/ | |||
typedef struct | |||
{ flag aerr; | |||
ftnint aunit; | |||
} alist; | |||
/* inquire */ | |||
typedef struct | |||
{ flag inerr; | |||
ftnint inunit; | |||
char *infile; | |||
ftnlen infilen; | |||
ftnint *inex; /*parameters in standard's order*/ | |||
ftnint *inopen; | |||
ftnint *innum; | |||
ftnint *innamed; | |||
char *inname; | |||
ftnlen innamlen; | |||
char *inacc; | |||
ftnlen inacclen; | |||
char *inseq; | |||
ftnlen inseqlen; | |||
char *indir; | |||
ftnlen indirlen; | |||
char *infmt; | |||
ftnlen infmtlen; | |||
char *inform; | |||
ftnint informlen; | |||
char *inunf; | |||
ftnlen inunflen; | |||
ftnint *inrecl; | |||
ftnint *innrec; | |||
char *inblank; | |||
ftnlen inblanklen; | |||
} inlist; | |||
#define VOID void | |||
union Multitype { /* for multiple entry points */ | |||
integer1 g; | |||
shortint h; | |||
integer i; | |||
/* longint j; */ | |||
real r; | |||
doublereal d; | |||
complex c; | |||
doublecomplex z; | |||
}; | |||
typedef union Multitype Multitype; | |||
/*typedef long int Long;*/ /* No longer used; formerly in Namelist */ | |||
struct Vardesc { /* for Namelist */ | |||
char *name; | |||
char *addr; | |||
ftnlen *dims; | |||
int type; | |||
}; | |||
typedef struct Vardesc Vardesc; | |||
struct Namelist { | |||
char *name; | |||
Vardesc **vars; | |||
int nvars; | |||
}; | |||
typedef struct Namelist Namelist; | |||
#define abs(x) ((x) >= 0 ? (x) : -(x)) | |||
#define dabs(x) (doublereal)abs(x) | |||
#define min(a,b) ((a) <= (b) ? (a) : (b)) | |||
#define max(a,b) ((a) >= (b) ? (a) : (b)) | |||
#define dmin(a,b) (doublereal)min(a,b) | |||
#define dmax(a,b) (doublereal)max(a,b) | |||
#define bit_test(a,b) ((a) >> (b) & 1) | |||
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) | |||
#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) | |||
/* procedure parameter types for -A and -C++ */ | |||
#define F2C_proc_par_types 1 | |||
#ifdef __cplusplus | |||
typedef int /* Unknown procedure type */ (*U_fp)(...); | |||
typedef shortint (*J_fp)(...); | |||
typedef integer (*I_fp)(...); | |||
typedef real (*R_fp)(...); | |||
typedef doublereal (*D_fp)(...), (*E_fp)(...); | |||
typedef /* Complex */ VOID (*C_fp)(...); | |||
typedef /* Double Complex */ VOID (*Z_fp)(...); | |||
typedef logical (*L_fp)(...); | |||
typedef shortlogical (*K_fp)(...); | |||
typedef /* Character */ VOID (*H_fp)(...); | |||
typedef /* Subroutine */ int (*S_fp)(...); | |||
#else | |||
typedef int /* Unknown procedure type */ (*U_fp)(); | |||
typedef shortint (*J_fp)(); | |||
typedef integer (*I_fp)(); | |||
typedef real (*R_fp)(); | |||
typedef doublereal (*D_fp)(), (*E_fp)(); | |||
typedef /* Complex */ VOID (*C_fp)(); | |||
typedef /* Double Complex */ VOID (*Z_fp)(); | |||
typedef logical (*L_fp)(); | |||
typedef shortlogical (*K_fp)(); | |||
typedef /* Character */ VOID (*H_fp)(); | |||
typedef /* Subroutine */ int (*S_fp)(); | |||
#endif | |||
/* E_fp is for real functions when -R is not specified */ | |||
typedef VOID C_f; /* complex function */ | |||
typedef VOID H_f; /* character function */ | |||
typedef VOID Z_f; /* double complex function */ | |||
typedef doublereal E_f; /* real function with -R not specified */ | |||
/* undef any lower-case symbols that your C compiler predefines, e.g.: */ | |||
#ifndef Skip_f2c_Undefs | |||
#undef cray | |||
#undef gcos | |||
#undef mc68010 | |||
#undef mc68020 | |||
#undef mips | |||
#undef pdp11 | |||
#undef sgi | |||
#undef sparc | |||
#undef sun | |||
#undef sun2 | |||
#undef sun3 | |||
#undef sun4 | |||
#undef u370 | |||
#undef u3b | |||
#undef u3b2 | |||
#undef u3b5 | |||
#undef unix | |||
#undef vax | |||
#endif | |||
#endif |
@@ -0,0 +1,80 @@ | |||
#ifndef LAPACK_H | |||
#define LAPACK_H | |||
extern int LAPACK(lsame)(const char *, const char *); | |||
extern int LAPACK(xerbla)(const char *, const int *); | |||
extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); | |||
extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); | |||
extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); | |||
extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); | |||
extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); | |||
extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); | |||
extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); | |||
extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); | |||
extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); | |||
extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); | |||
extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); | |||
extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); | |||
extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); | |||
extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); | |||
extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); | |||
extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); | |||
extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *); | |||
extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *); | |||
extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *); | |||
extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *); | |||
extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *); | |||
extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *); | |||
extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *); | |||
extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *); | |||
extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *); | |||
extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *); | |||
extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *); | |||
extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *); | |||
extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *); | |||
extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *); | |||
extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *); | |||
extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *); | |||
extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *); | |||
extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *); | |||
extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *); | |||
extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *); | |||
#endif /* LAPACK_H */ |
@@ -0,0 +1,607 @@ | |||
#include "relapack.h" | |||
//////////// | |||
// XLAUUM // | |||
//////////// | |||
#if INCLUDE_SLAUUM | |||
void LAPACK(slauum)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_slauum(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_DLAUUM | |||
void LAPACK(dlauum)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_dlauum(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_CLAUUM | |||
void LAPACK(clauum)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_clauum(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZLAUUM | |||
void LAPACK(zlauum)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_zlauum(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
//////////// | |||
// XSYGST // | |||
//////////// | |||
#if INCLUDE_SSYGST | |||
void LAPACK(ssygst)( | |||
const int *itype, const char *uplo, const int *n, | |||
float *A, const int *ldA, const float *B, const int *ldB, | |||
int *info | |||
) { | |||
RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
} | |||
#endif | |||
#if INCLUDE_DSYGST | |||
void LAPACK(dsygst)( | |||
const int *itype, const char *uplo, const int *n, | |||
double *A, const int *ldA, const double *B, const int *ldB, | |||
int *info | |||
) { | |||
RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
} | |||
#endif | |||
#if INCLUDE_CHEGST | |||
void LAPACK(chegst)( | |||
const int *itype, const char *uplo, const int *n, | |||
float *A, const int *ldA, const float *B, const int *ldB, | |||
int *info | |||
) { | |||
RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZHEGST | |||
void LAPACK(zhegst)( | |||
const int *itype, const char *uplo, const int *n, | |||
double *A, const int *ldA, const double *B, const int *ldB, | |||
int *info | |||
) { | |||
RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); | |||
} | |||
#endif | |||
//////////// | |||
// XTRTRI // | |||
//////////// | |||
#if INCLUDE_STRTRI | |||
void LAPACK(strtri)( | |||
const char *uplo, const char *diag, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_strtri(uplo, diag, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_DTRTRI | |||
void LAPACK(dtrtri)( | |||
const char *uplo, const char *diag, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_CTRTRI | |||
void LAPACK(ctrtri)( | |||
const char *uplo, const char *diag, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZTRTRI | |||
void LAPACK(ztrtri)( | |||
const char *uplo, const char *diag, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); | |||
} | |||
#endif | |||
//////////// | |||
// XPOTRF // | |||
//////////// | |||
#if INCLUDE_SPOTRF | |||
void LAPACK(spotrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_spotrf(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_DPOTRF | |||
void LAPACK(dpotrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_dpotrf(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_CPOTRF | |||
void LAPACK(cpotrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_cpotrf(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZPOTRF | |||
void LAPACK(zpotrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_zpotrf(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
//////////// | |||
// XPBTRF // | |||
//////////// | |||
#if INCLUDE_SPBTRF | |||
void LAPACK(spbtrf)( | |||
const char *uplo, const int *n, const int *kd, | |||
float *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); | |||
} | |||
#endif | |||
#if INCLUDE_DPBTRF | |||
void LAPACK(dpbtrf)( | |||
const char *uplo, const int *n, const int *kd, | |||
double *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
} | |||
#endif | |||
#if INCLUDE_CPBTRF | |||
void LAPACK(cpbtrf)( | |||
const char *uplo, const int *n, const int *kd, | |||
float *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZPBTRF | |||
void LAPACK(zpbtrf)( | |||
const char *uplo, const int *n, const int *kd, | |||
double *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
} | |||
#endif | |||
//////////// | |||
// XSYTRF // | |||
//////////// | |||
#if INCLUDE_SSYTRF | |||
void LAPACK(ssytrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_DSYTRF | |||
void LAPACK(dsytrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CSYTRF | |||
void LAPACK(csytrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZSYTRF | |||
void LAPACK(zsytrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CHETRF | |||
void LAPACK(chetrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZHETRF | |||
void LAPACK(zhetrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_SSYTRF_ROOK | |||
void LAPACK(ssytrf_rook)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_DSYTRF_ROOK | |||
void LAPACK(dsytrf_rook)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CSYTRF_ROOK | |||
void LAPACK(csytrf_rook)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZSYTRF_ROOK | |||
void LAPACK(zsytrf_rook)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CHETRF_ROOK | |||
void LAPACK(chetrf_rook)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZHETRF_ROOK | |||
void LAPACK(zhetrf_rook)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
//////////// | |||
// XGETRF // | |||
//////////// | |||
#if INCLUDE_SGETRF | |||
void LAPACK(sgetrf)( | |||
const int *m, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_DGETRF | |||
void LAPACK(dgetrf)( | |||
const int *m, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_CGETRF | |||
void LAPACK(cgetrf)( | |||
const int *m, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZGETRF | |||
void LAPACK(zgetrf)( | |||
const int *m, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); | |||
} | |||
#endif | |||
//////////// | |||
// XGBTRF // | |||
//////////// | |||
#if INCLUDE_SGBTRF | |||
void LAPACK(sgbtrf)( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
float *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_DGBTRF | |||
void LAPACK(dgbtrf)( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
double *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_CGBTRF | |||
void LAPACK(cgbtrf)( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
float *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZGBTRF | |||
void LAPACK(zgbtrf)( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
double *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
} | |||
#endif | |||
//////////// | |||
// XTRSYL // | |||
//////////// | |||
#if INCLUDE_STRSYL | |||
void LAPACK(strsyl)( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, float *scale, | |||
int *info | |||
) { | |||
RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
#endif | |||
#if INCLUDE_DTRSYL | |||
void LAPACK(dtrsyl)( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, double *scale, | |||
int *info | |||
) { | |||
RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
#endif | |||
#if INCLUDE_CTRSYL | |||
void LAPACK(ctrsyl)( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, float *scale, | |||
int *info | |||
) { | |||
RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZTRSYL | |||
void LAPACK(ztrsyl)( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, double *scale, | |||
int *info | |||
) { | |||
RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
#endif | |||
//////////// | |||
// XTGSYL // | |||
//////////// | |||
#if INCLUDE_STGSYL | |||
void LAPACK(stgsyl)( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, | |||
const float *D, const int *ldD, const float *E, const int *ldE, | |||
float *F, const int *ldF, | |||
float *scale, float *dif, | |||
float *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_DTGSYL | |||
void LAPACK(dtgsyl)( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, | |||
const double *D, const int *ldD, const double *E, const int *ldE, | |||
double *F, const int *ldF, | |||
double *scale, double *dif, | |||
double *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CTGSYL | |||
void LAPACK(ctgsyl)( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, | |||
const float *D, const int *ldD, const float *E, const int *ldE, | |||
float *F, const int *ldF, | |||
float *scale, float *dif, | |||
float *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZTGSYL | |||
void LAPACK(ztgsyl)( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, | |||
const double *D, const int *ldD, const double *E, const int *ldE, | |||
double *F, const int *ldF, | |||
double *scale, double *dif, | |||
double *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
} | |||
#endif | |||
//////////// | |||
// XGEMMT // | |||
//////////// | |||
#if INCLUDE_SGEMMT | |||
void LAPACK(sgemmt)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
RELAPACK_sgemmt(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_DGEMMT | |||
void LAPACK(dgemmt)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
RELAPACK_dgemmt(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_CGEMMT | |||
void LAPACK(cgemmt)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
RELAPACK_cgemmt(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZGEMMT | |||
void LAPACK(zgemmt)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
RELAPACK_zgemmt(uplo, n, A, ldA, info); | |||
} | |||
#endif |
@@ -0,0 +1,607 @@ | |||
#include "relapack.h" | |||
//////////// | |||
// XLAUUM // | |||
//////////// | |||
#if INCLUDE_SLAUUM | |||
void LAPACK(slauum)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_slauum(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_DLAUUM | |||
void LAPACK(dlauum)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_dlauum(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_CLAUUM | |||
void LAPACK(clauum)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_clauum(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZLAUUM | |||
void LAPACK(zlauum)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_zlauum(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
//////////// | |||
// XSYGST // | |||
//////////// | |||
#if INCLUDE_SSYGST | |||
void LAPACK(ssygst)( | |||
const int *itype, const char *uplo, const int *n, | |||
float *A, const int *ldA, const float *B, const int *ldB, | |||
int *info | |||
) { | |||
RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
} | |||
#endif | |||
#if INCLUDE_DSYGST | |||
void LAPACK(dsygst)( | |||
const int *itype, const char *uplo, const int *n, | |||
double *A, const int *ldA, const double *B, const int *ldB, | |||
int *info | |||
) { | |||
RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
} | |||
#endif | |||
#if INCLUDE_CSYGST | |||
void LAPACK(csygst)( | |||
const int *itype, const char *uplo, const int *n, | |||
float *A, const int *ldA, const float *B, const int *ldB, | |||
int *info | |||
) { | |||
RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZSYGST | |||
void LAPACK(zsygst)( | |||
const int *itype, const char *uplo, const int *n, | |||
double *A, const int *ldA, const double *B, const int *ldB, | |||
int *info | |||
) { | |||
RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
} | |||
#endif | |||
//////////// | |||
// XTRTRI // | |||
//////////// | |||
#if INCLUDE_STRTRI | |||
void LAPACK(strtri)( | |||
const char *uplo, const char *diag, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_strtri(uplo, diag, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_DTRTRI | |||
void LAPACK(dtrtri)( | |||
const char *uplo, const char *diag, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_CTRTRI | |||
void LAPACK(ctrtri)( | |||
const char *uplo, const char *diag, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZTRTRI | |||
void LAPACK(ztrtri)( | |||
const char *uplo, const char *diag, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); | |||
} | |||
#endif | |||
//////////// | |||
// XPOTRF // | |||
//////////// | |||
#if INCLUDE_SPOTRF | |||
void LAPACK(spotrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_spotrf(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_DPOTRF | |||
void LAPACK(dpotrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_dpotrf(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_CPOTRF | |||
void LAPACK(cpotrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_cpotrf(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZPOTRF | |||
void LAPACK(zpotrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
RELAPACK_zpotrf(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
//////////// | |||
// XPBTRF // | |||
//////////// | |||
#if INCLUDE_SPBTRF | |||
void LAPACK(spbtrf)( | |||
const char *uplo, const int *n, const int *kd, | |||
float *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); | |||
} | |||
#endif | |||
#if INCLUDE_DPBTRF | |||
void LAPACK(dpbtrf)( | |||
const char *uplo, const int *n, const int *kd, | |||
double *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
} | |||
#endif | |||
#if INCLUDE_CPBTRF | |||
void LAPACK(cpbtrf)( | |||
const char *uplo, const int *n, const int *kd, | |||
float *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZPBTRF | |||
void LAPACK(zpbtrf)( | |||
const char *uplo, const int *n, const int *kd, | |||
double *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
} | |||
#endif | |||
//////////// | |||
// XSYTRF // | |||
//////////// | |||
#if INCLUDE_SSYTRF | |||
void LAPACK(ssytrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_DSYTRF | |||
void LAPACK(dsytrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CSYTRF | |||
void LAPACK(csytrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZSYTRF | |||
void LAPACK(zsytrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CHETRF | |||
void LAPACK(chetrf)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZHETRF | |||
void LAPACK(zhetrf)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_SSYTRF_ROOK | |||
void LAPACK(ssytrf_rook)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_DSYTRF_ROOK | |||
void LAPACK(dsytrf_rook)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CSYTRF_ROOK | |||
void LAPACK(csytrf_rook)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZSYTRF_ROOK | |||
void LAPACK(zsytrf_rook)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CHETRF_ROOK | |||
void LAPACK(chetrf_rook)( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZHETRF_ROOK | |||
void LAPACK(zhetrf_rook)( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
} | |||
#endif | |||
//////////// | |||
// XGETRF // | |||
//////////// | |||
#if INCLUDE_SGETRF | |||
void LAPACK(sgetrf)( | |||
const int *m, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_DGETRF | |||
void LAPACK(dgetrf)( | |||
const int *m, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_CGETRF | |||
void LAPACK(cgetrf)( | |||
const int *m, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZGETRF | |||
void LAPACK(zgetrf)( | |||
const int *m, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); | |||
} | |||
#endif | |||
//////////// | |||
// XGBTRF // | |||
//////////// | |||
#if INCLUDE_SGBTRF | |||
void LAPACK(sgbtrf)( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
float *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_DGBTRF | |||
void LAPACK(dgbtrf)( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
double *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_CGBTRF | |||
void LAPACK(cgbtrf)( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
float *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZGBTRF | |||
void LAPACK(zgbtrf)( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
double *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
} | |||
#endif | |||
//////////// | |||
// XTRSYL // | |||
//////////// | |||
#if INCLUDE_STRSYL | |||
void LAPACK(strsyl)( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, float *scale, | |||
int *info | |||
) { | |||
RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
#endif | |||
#if INCLUDE_DTRSYL | |||
void LAPACK(dtrsyl)( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, double *scale, | |||
int *info | |||
) { | |||
RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
#endif | |||
#if INCLUDE_CTRSYL | |||
void LAPACK(ctrsyl)( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, float *scale, | |||
int *info | |||
) { | |||
RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZTRSYL | |||
void LAPACK(ztrsyl)( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, double *scale, | |||
int *info | |||
) { | |||
RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
#endif | |||
//////////// | |||
// XTGSYL // | |||
//////////// | |||
#if INCLUDE_STGSYL | |||
void LAPACK(stgsyl)( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, | |||
const float *D, const int *ldD, const float *E, const int *ldE, | |||
float *F, const int *ldF, | |||
float *scale, float *dif, | |||
float *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_DTGSYL | |||
void LAPACK(dtgsyl)( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, | |||
const double *D, const int *ldD, const double *E, const int *ldE, | |||
double *F, const int *ldF, | |||
double *scale, double *dif, | |||
double *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_CTGSYL | |||
void LAPACK(ctgsyl)( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, | |||
const float *D, const int *ldD, const float *E, const int *ldE, | |||
float *F, const int *ldF, | |||
float *scale, float *dif, | |||
float *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZTGSYL | |||
void LAPACK(ztgsyl)( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, | |||
const double *D, const int *ldD, const double *E, const int *ldE, | |||
double *F, const int *ldF, | |||
double *scale, double *dif, | |||
double *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
} | |||
#endif | |||
//////////// | |||
// XGEMMT // | |||
//////////// | |||
#if INCLUDE_SGEMMT | |||
void LAPACK(sgemmt)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
RELAPACK_sgemmt(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_DGEMMT | |||
void LAPACK(dgemmt)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
RELAPACK_dgemmt(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_CGEMMT | |||
void LAPACK(cgemmt)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
RELAPACK_cgemmt(uplo, n, A, ldA, info); | |||
} | |||
#endif | |||
#if INCLUDE_ZGEMMT | |||
void LAPACK(zgemmt)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
RELAPACK_zgemmt(uplo, n, A, ldA, info); | |||
} | |||
#endif |
@@ -0,0 +1,60 @@ | |||
#ifndef RELAPACK_INT_H | |||
#define RELAPACK_INT_H | |||
#include "../config.h" | |||
#include "../inc/relapack.h" | |||
// add an underscore to BLAS routines (or not) | |||
#if BLAS_UNDERSCORE | |||
#define BLAS(routine) routine ## _ | |||
#else | |||
#define BLAS(routine) routine | |||
#endif | |||
// add an underscore to LAPACK routines (or not) | |||
#if LAPACK_UNDERSCORE | |||
#define LAPACK(routine) routine ## _ | |||
#else | |||
#define LAPACK(routine) routine | |||
#endif | |||
// minimum and maximum macros | |||
#define MAX(a, b) ((a) > (b) ? (a) : (b)) | |||
#define MIN(a, b) ((a) < (b) ? (a) : (b)) | |||
// REC_SPLIT(n) returns how a problem of size n is split recursively. | |||
// If n >= 16, we ensure that the size of at least one of the halves is | |||
// divisible by 8 (the cache line size in most CPUs), while both halves are | |||
// still as close as possible in size. | |||
// If n < 16 the problem is simply split in the middle. (Note that the | |||
// crossoversize is usually larger than 16.) | |||
#define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2) | |||
#define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2) | |||
#define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2) | |||
#define ZREC_SPLIT(n) ((n >= 8) ? ((n + 4) / 8) * 4 : n / 2) | |||
#include "lapack.h" | |||
#include "blas.h" | |||
// sytrf helper routines | |||
void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
// trsyl helper routines | |||
void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
#endif /* RELAPACK_INT_H */ |
@@ -0,0 +1,227 @@ | |||
#include "relapack.h" | |||
#include "stdlib.h" | |||
static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, | |||
const int *, float *, const int *, int *, float *, const int *, float *, | |||
const int *, int *); | |||
/** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. | |||
* | |||
* This routine is functionally equivalent to LAPACK's sgbtrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html | |||
* */ | |||
void RELAPACK_sgbtrf( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
float *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
// Check arguments | |||
*info = 0; | |||
if (*m < 0) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*kl < 0) | |||
*info = -3; | |||
else if (*ku < 0) | |||
*info = -4; | |||
else if (*ldAb < 2 * *kl + *ku + 1) | |||
*info = -6; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("SGBTRF", &minfo); | |||
return; | |||
} | |||
// Constant | |||
const float ZERO[] = { 0. }; | |||
// Result upper band width | |||
const int kv = *ku + *kl; | |||
// Unskewg A | |||
const int ldA[] = { *ldAb - 1 }; | |||
float *const A = Ab + kv; | |||
// Zero upper diagonal fill-in elements | |||
int i, j; | |||
for (j = 0; j < *n; j++) { | |||
float *const A_j = A + *ldA * j; | |||
for (i = MAX(0, j - kv); i < j - *ku; i++) | |||
A_j[i] = 0.; | |||
} | |||
// Allocate work space | |||
const int n1 = SREC_SPLIT(*n); | |||
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
const int nWorkl = (kv > n1) ? n1 : kv; | |||
const int mWorku = (*kl > n1) ? n1 : *kl; | |||
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); | |||
float *Worku = malloc(mWorku * nWorku * sizeof(float)); | |||
LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
// Recursive kernel | |||
RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
// Free work space | |||
free(Workl); | |||
free(Worku); | |||
} | |||
/** sgbtrf's recursive compute kernel */ | |||
static void RELAPACK_sgbtrf_rec( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
float *Ab, const int *ldAb, int *ipiv, | |||
float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { | |||
// Unblocked | |||
LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterators | |||
int i, j; | |||
// Output upper band width | |||
const int kv = *ku + *kl; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
float *const A = Ab + kv; | |||
// Splitting | |||
const int n1 = MIN(SREC_SPLIT(*n), *kl); | |||
const int n2 = *n - n1; | |||
const int m1 = MIN(n1, *m); | |||
const int m2 = *m - m1; | |||
const int mn1 = MIN(m1, n1); | |||
const int mn2 = MIN(m2, n2); | |||
// Ab_L * | |||
// Ab_BR | |||
float *const Ab_L = Ab; | |||
float *const Ab_BR = Ab + *ldAb * n1; | |||
// A_L A_R | |||
float *const A_L = A; | |||
float *const A_R = A + *ldA * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + *ldA * n1; | |||
float *const A_BL = A + m1; | |||
float *const A_BR = A + *ldA * n1 + m1; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_T = ipiv; | |||
int *const ipiv_B = ipiv + n1; | |||
// Banded splitting | |||
const int n21 = MIN(n2, kv - n1); | |||
const int n22 = MIN(n2 - n21, n1); | |||
const int m21 = MIN(m2, *kl - m1); | |||
const int m22 = MIN(m2 - m21, m1); | |||
// n1 n21 n22 | |||
// m * A_Rl ARr | |||
float *const A_Rl = A_R; | |||
float *const A_Rr = A_R + *ldA * n21; | |||
// n1 n21 n22 | |||
// m1 * A_TRl A_TRr | |||
// m21 A_BLt A_BRtl A_BRtr | |||
// m22 A_BLb A_BRbl A_BRbr | |||
float *const A_TRl = A_TR; | |||
float *const A_TRr = A_TR + *ldA * n21; | |||
float *const A_BLt = A_BL; | |||
float *const A_BLb = A_BL + m21; | |||
float *const A_BRtl = A_BR; | |||
float *const A_BRtr = A_BR + *ldA * n21; | |||
float *const A_BRbl = A_BR + m21; | |||
float *const A_BRbr = A_BR + *ldA * n21 + m21; | |||
// recursion(Ab_L, ipiv_T) | |||
RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
// Workl = A_BLb | |||
LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
// partially redo swaps in A_L | |||
for (i = 0; i < mn1; i++) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
if (ip < *kl) | |||
BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); | |||
else | |||
BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); | |||
} | |||
} | |||
// apply pivots to A_Rl | |||
LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
// apply pivots to A_Rr columnwise | |||
for (j = 0; j < n22; j++) { | |||
float *const A_Rrj = A_Rr + *ldA * j; | |||
for (i = j; i < mn1; i++) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
const float tmp = A_Rrj[i]; | |||
A_Rrj[i] = A_Rr[ip]; | |||
A_Rrj[ip] = tmp; | |||
} | |||
} | |||
} | |||
// A_TRl = A_TL \ A_TRl | |||
BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
// Worku = A_TRr | |||
LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
// Worku = A_TL \ Worku | |||
BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
// A_TRr = Worku | |||
LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
// A_BRtl = A_BRtl - A_BLt * A_TRl | |||
BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
// A_BRbl = A_BRbl - Workl * A_TRl | |||
BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
// A_BRtr = A_BRtr - A_BLt * Worku | |||
BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
// A_BRbr = A_BRbr - Workl * Worku | |||
BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
// partially undo swaps in A_L | |||
for (i = mn1 - 1; i >= 0; i--) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
if (ip < *kl) | |||
BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); | |||
else | |||
BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); | |||
} | |||
} | |||
// recursion(Ab_BR, ipiv_B) | |||
RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
if (*info) | |||
*info += n1; | |||
// shift pivots | |||
for (i = 0; i < mn2; i++) | |||
ipiv_B[i] += n1; | |||
} |
@@ -0,0 +1,165 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, | |||
const int *, const int *, const float *, const float *, const int *, | |||
const float *, const int *, const float *, float *, const int *); | |||
static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, | |||
const int *, const int *, const float *, const float *, const int *, | |||
const float *, const int *, const float *, float *, const int *); | |||
/** SGEMMT computes a matrix-matrix product with general matrices but updates | |||
* only the upper or lower triangular part of the result matrix. | |||
* | |||
* This routine performs the same operation as the BLAS routine | |||
* sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
* but only updates the triangular part of C specified by uplo: | |||
* If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
* otherwise the upper triangular part is updated. | |||
* */ | |||
void RELAPACK_sgemmt( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
#if HAVE_XGEMMT | |||
BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
return; | |||
#else | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
const int notransA = LAPACK(lsame)(transA, "N"); | |||
const int tranA = LAPACK(lsame)(transA, "T"); | |||
const int notransB = LAPACK(lsame)(transB, "N"); | |||
const int tranB = LAPACK(lsame)(transB, "T"); | |||
int info = 0; | |||
if (!lower && !upper) | |||
info = 1; | |||
else if (!tranA && !notransA) | |||
info = 2; | |||
else if (!tranB && !notransB) | |||
info = 3; | |||
else if (*n < 0) | |||
info = 4; | |||
else if (*k < 0) | |||
info = 5; | |||
else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
info = 8; | |||
else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
info = 10; | |||
else if (*ldC < MAX(1, *n)) | |||
info = 13; | |||
if (info) { | |||
LAPACK(xerbla)("SGEMMT", &info); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
const char cleantransA = notransA ? 'N' : 'T'; | |||
const char cleantransB = notransB ? 'N' : 'T'; | |||
// Recursive kernel | |||
RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
#endif | |||
} | |||
/** sgemmt's recursive compute kernel */ | |||
static void RELAPACK_sgemmt_rec( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { | |||
// Unblocked | |||
RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
return; | |||
} | |||
// Splitting | |||
const int n1 = SREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_T | |||
// A_B | |||
const float *const A_T = A; | |||
const float *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); | |||
// B_L B_R | |||
const float *const B_L = B; | |||
const float *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); | |||
// C_TL C_TR | |||
// C_BL C_BR | |||
float *const C_TL = C; | |||
float *const C_TR = C + *ldC * n1; | |||
float *const C_BL = C + n1; | |||
float *const C_BR = C + *ldC * n1 + n1; | |||
// recursion(C_TL) | |||
RELAPACK_sgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
if (*uplo == 'L') | |||
// C_BL = alpha A_B B_L + beta C_BL | |||
BLAS(sgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
else | |||
// C_TR = alpha A_T B_R + beta C_TR | |||
BLAS(sgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
// recursion(C_BR) | |||
RELAPACK_sgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
} | |||
/** sgemmt's unblocked compute kernel */ | |||
static void RELAPACK_sgemmt_rec2( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const float *alpha, const float *A, const int *ldA, | |||
const float *B, const int *ldB, | |||
const float *beta, float *C, const int *ldC | |||
) { | |||
const int incB = (*transB == 'N') ? 1 : *ldB; | |||
const int incC = 1; | |||
int i; | |||
for (i = 0; i < *n; i++) { | |||
// A_0 | |||
// A_i | |||
const float *const A_0 = A; | |||
const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i); | |||
// * B_i * | |||
const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i); | |||
// * C_0i * | |||
// * C_ii * | |||
float *const C_0i = C + *ldC * i; | |||
float *const C_ii = C + *ldC * i + i; | |||
if (*uplo == 'L') { | |||
const int nmi = *n - i; | |||
if (*transA == 'N') | |||
BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
else | |||
BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
} else { | |||
const int ip1 = i + 1; | |||
if (*transA == 'N') | |||
BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
else | |||
BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
} | |||
} | |||
} |
@@ -0,0 +1,117 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, | |||
int *, int *); | |||
/** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
* | |||
* This routine is functionally equivalent to LAPACK's sgetrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html | |||
* */ | |||
void RELAPACK_sgetrf( | |||
const int *m, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
// Check arguments | |||
*info = 0; | |||
if (*m < 0) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("SGETRF", &minfo); | |||
return; | |||
} | |||
const int sn = MIN(*m, *n); | |||
RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
// Right remainder | |||
if (*m < *n) { | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const int iONE[] = { 1. }; | |||
// Splitting | |||
const int rn = *n - *m; | |||
// A_L A_R | |||
const float *const A_L = A; | |||
float *const A_R = A + *ldA * *m; | |||
// A_R = apply(ipiv, A_R) | |||
LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
// A_R = A_L \ A_R | |||
BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
} | |||
} | |||
/** sgetrf's recursive compute kernel */ | |||
static void RELAPACK_sgetrf_rec( | |||
const int *m, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_SGETRF, 1)) { | |||
// Unblocked | |||
LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
// Splitting | |||
const int n1 = SREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
const int m2 = *m - n1; | |||
// A_L A_R | |||
float *const A_L = A; | |||
float *const A_R = A + *ldA * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + *ldA * n1; | |||
float *const A_BL = A + n1; | |||
float *const A_BR = A + *ldA * n1 + n1; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_T = ipiv; | |||
int *const ipiv_B = ipiv + n1; | |||
// recursion(A_L, ipiv_T) | |||
RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
// apply pivots to A_R | |||
LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
// A_TR = A_TL \ A_TR | |||
BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
// A_BR = A_BR - A_BL * A_TR | |||
BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
// recursion(A_BR, ipiv_B) | |||
RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
if (*info) | |||
*info += n1; | |||
// apply pivots to A_BL | |||
LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
ipiv_B[i] += n1; | |||
} |
@@ -0,0 +1,87 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_slauum_rec(const char *, const int *, float *, | |||
const int *, int *); | |||
/** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's slauum. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html | |||
* */ | |||
void RELAPACK_slauum( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("SLAUUM", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Recursive kernel | |||
RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info); | |||
} | |||
/** slauum's recursive compute kernel */ | |||
static void RELAPACK_slauum_rec( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_SLAUUM, 1)) { | |||
// Unblocked | |||
LAPACK(slauu2)(uplo, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
// Splitting | |||
const int n1 = SREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + *ldA * n1; | |||
float *const A_BL = A + n1; | |||
float *const A_BR = A + *ldA * n1 + n1; | |||
// recursion(A_TL) | |||
RELAPACK_slauum_rec(uplo, &n1, A_TL, ldA, info); | |||
if (*uplo == 'L') { | |||
// A_TL = A_TL + A_BL' * A_BL | |||
BLAS(ssyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
// A_BL = A_BR' * A_BL | |||
BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
} else { | |||
// A_TL = A_TL + A_TR * A_TR' | |||
BLAS(ssyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
// A_TR = A_TR * A_BR' | |||
BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_slauum_rec(uplo, &n2, A_BR, ldA, info); | |||
} |
@@ -0,0 +1,157 @@ | |||
#include "relapack.h" | |||
#include "stdlib.h" | |||
static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, | |||
float *, const int *, float *, const int *, int *); | |||
/** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's spbtrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html | |||
* */ | |||
void RELAPACK_spbtrf( | |||
const char *uplo, const int *n, const int *kd, | |||
float *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*kd < 0) | |||
*info = -3; | |||
else if (*ldAb < *kd + 1) | |||
*info = -5; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("SPBTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Constant | |||
const float ZERO[] = { 0. }; | |||
// Allocate work space | |||
const int n1 = SREC_SPLIT(*n); | |||
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
float *Work = malloc(mWork * nWork * sizeof(float)); | |||
LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
// Recursive kernel | |||
RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
// Free work space | |||
free(Work); | |||
} | |||
/** spbtrf's recursive compute kernel */ | |||
static void RELAPACK_spbtrf_rec( | |||
const char *uplo, const int *n, const int *kd, | |||
float *Ab, const int *ldAb, | |||
float *Work, const int *ldWork, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { | |||
// Unblocked | |||
LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
float *const A = Ab + ((*uplo == 'L') ? 0 : *kd); | |||
// Splitting | |||
const int n1 = MIN(SREC_SPLIT(*n), *kd); | |||
const int n2 = *n - n1; | |||
// * * | |||
// * Ab_BR | |||
float *const Ab_BR = Ab + *ldAb * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + *ldA * n1; | |||
float *const A_BL = A + n1; | |||
float *const A_BR = A + *ldA * n1 + n1; | |||
// recursion(A_TL) | |||
RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
// Banded splitting | |||
const int n21 = MIN(n2, *kd - n1); | |||
const int n22 = MIN(n2 - n21, *kd); | |||
// n1 n21 n22 | |||
// n1 * A_TRl A_TRr | |||
// n21 A_BLt A_BRtl A_BRtr | |||
// n22 A_BLb A_BRbl A_BRbr | |||
float *const A_TRl = A_TR; | |||
float *const A_TRr = A_TR + *ldA * n21; | |||
float *const A_BLt = A_BL; | |||
float *const A_BLb = A_BL + n21; | |||
float *const A_BRtl = A_BR; | |||
float *const A_BRtr = A_BR + *ldA * n21; | |||
float *const A_BRbl = A_BR + n21; | |||
float *const A_BRbr = A_BR + *ldA * n21 + n21; | |||
if (*uplo == 'L') { | |||
// A_BLt = ABLt / A_TL' | |||
BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); | |||
// A_BRtl = A_BRtl - A_BLt * A_BLt' | |||
BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); | |||
// Work = A_BLb | |||
LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); | |||
// Work = Work / A_TL' | |||
BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); | |||
// A_BRbl = A_BRbl - Work * A_BLt' | |||
BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); | |||
// A_BRbr = A_BRbr - Work * Work' | |||
BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
// A_BLb = Work | |||
LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); | |||
} else { | |||
// A_TRl = A_TL' \ A_TRl | |||
BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
// A_BRtl = A_BRtl - A_TRl' * A_TRl | |||
BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
// Work = A_TRr | |||
LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); | |||
// Work = A_TL' \ Work | |||
BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); | |||
// A_BRtr = A_BRtr - A_TRl' * Work | |||
BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); | |||
// A_BRbr = A_BRbr - Work' * Work | |||
BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
// A_TRr = Work | |||
LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); | |||
} | |||
// recursion(A_BR) | |||
if (*kd > n1) | |||
RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info); | |||
else | |||
RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,92 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_spotrf_rec(const char *, const int *, float *, | |||
const int *, int *); | |||
/** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's spotrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html | |||
* */ | |||
void RELAPACK_spotrf( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("SPOTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Recursive kernel | |||
RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info); | |||
} | |||
/** spotrf's recursive compute kernel */ | |||
static void RELAPACK_spotrf_rec( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { | |||
// Unblocked | |||
LAPACK(spotf2)(uplo, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
// Splitting | |||
const int n1 = SREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + *ldA * n1; | |||
float *const A_BL = A + n1; | |||
float *const A_BR = A + *ldA * n1 + n1; | |||
// recursion(A_TL) | |||
RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL / A_TL' | |||
BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
// A_BR = A_BR - A_BL * A_BL' | |||
BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
} else { | |||
// A_TR = A_TL' \ A_TR | |||
BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
// A_BR = A_BR - A_TR' * A_TR | |||
BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,212 @@ | |||
#include "relapack.h" | |||
#if XSYGST_ALLOW_MALLOC | |||
#include "stdlib.h" | |||
#endif | |||
static void RELAPACK_ssygst_rec(const int *, const char *, const int *, | |||
float *, const int *, const float *, const int *, | |||
float *, const int *, int *); | |||
/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ssygst. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html | |||
* */ | |||
void RELAPACK_ssygst( | |||
const int *itype, const char *uplo, const int *n, | |||
float *A, const int *ldA, const float *B, const int *ldB, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (*itype < 1 || *itype > 3) | |||
*info = -1; | |||
else if (!lower && !upper) | |||
*info = -2; | |||
else if (*n < 0) | |||
*info = -3; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -5; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -7; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("SSYGST", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Allocate work space | |||
float *Work = NULL; | |||
int lWork = 0; | |||
#if XSYGST_ALLOW_MALLOC | |||
const int n1 = SREC_SPLIT(*n); | |||
lWork = n1 * (*n - n1); | |||
Work = malloc(lWork * sizeof(float)); | |||
if (!Work) | |||
lWork = 0; | |||
#endif | |||
// Recursive kernel | |||
RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
// Free work space | |||
#if XSYGST_ALLOW_MALLOC | |||
if (Work) | |||
free(Work); | |||
#endif | |||
} | |||
/** ssygst's recursive compute kernel */ | |||
static void RELAPACK_ssygst_rec( | |||
const int *itype, const char *uplo, const int *n, | |||
float *A, const int *ldA, const float *B, const int *ldB, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_SSYGST, 1)) { | |||
// Unblocked | |||
LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info); | |||
return; | |||
} | |||
// Constants | |||
const float ZERO[] = { 0. }; | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
const float HALF[] = { .5 }; | |||
const float MHALF[] = { -.5 }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterator | |||
int i; | |||
// Splitting | |||
const int n1 = SREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + *ldA * n1; | |||
float *const A_BL = A + n1; | |||
float *const A_BR = A + *ldA * n1 + n1; | |||
// B_TL B_TR | |||
// B_BL B_BR | |||
const float *const B_TL = B; | |||
const float *const B_TR = B + *ldB * n1; | |||
const float *const B_BL = B + n1; | |||
const float *const B_BR = B + *ldB * n1 + n1; | |||
// recursion(A_TL, B_TL) | |||
RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); | |||
if (*itype == 1) | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL / B_TL' | |||
BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = -1/2 * B_BL * A_TL | |||
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
} else | |||
// A_BL = A_BL - 1/2 B_BL * A_TL | |||
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' | |||
BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
else | |||
// A_BL = A_BL - 1/2 B_BL * A_TL | |||
BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BL = B_BR \ A_BL | |||
BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
} else { | |||
// A_TR = B_TL' \ A_TR | |||
BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = -1/2 * A_TL * B_TR | |||
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
// A_TR = A_BL + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
} else | |||
// A_TR = A_TR - 1/2 A_TL * B_TR | |||
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR | |||
BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_TR = A_BL + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
else | |||
// A_TR = A_TR - 1/2 A_TL * B_TR | |||
BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TR = A_TR / B_BR | |||
BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
} | |||
else | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL * B_TL | |||
BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = 1/2 * A_BR * B_BL | |||
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
} else | |||
// A_BL = A_BL + 1/2 A_BR * B_BL | |||
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL | |||
BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
else | |||
// A_BL = A_BL + 1/2 A_BR * B_BL | |||
BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BL = B_BR * A_BL | |||
BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
} else { | |||
// A_TR = B_TL * A_TR | |||
BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
if (*lWork > n2 * n1) { | |||
// T = 1/2 * B_TR * A_BR | |||
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
// A_TR = A_TR + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
} else | |||
// A_TR = A_TR + 1/2 B_TR A_BR | |||
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' | |||
BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); | |||
if (*lWork > n2 * n1) | |||
// A_TR = A_TR + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
else | |||
// A_TR = A_TR + 1/2 B_TR * A_BR | |||
BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TR = A_TR * B_BR | |||
BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
} | |||
// recursion(A_BR, B_BR) | |||
RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); | |||
} |
@@ -0,0 +1,238 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, | |||
float *, const int *, int *, float *, const int *, int *); | |||
/** SSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ssytrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html | |||
* */ | |||
void RELAPACK_ssytrf( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
float *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * sizeof(float)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("SSYTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy arguments | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** ssytrf's recursive compute kernel */ | |||
static void RELAPACK_ssytrf_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_SSYTRF, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterator | |||
int i; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = SREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
float *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
float *const A_BL = A + n1; | |||
float *const A_BR = A + *ldA * n1 + n1; | |||
float *const A_BL_B = A + *n; | |||
float *const A_BR_B = A + *ldA * n1 + *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
float *const Work_BL = Work + n1; | |||
float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; | |||
// last row of A_BL | |||
float *const A_BL_b = A_BL + n2_out; | |||
// last row of Work_BL | |||
float *const Work_BL_b = Work_BL + n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = SREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
float *const Work_R = top ? Work : Work + *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
float *const A_TL_T = A + *ldA * n_rest; | |||
float *const A_TR_T = A + *ldA * (n_rest + n1); | |||
float *const A_TL = A + *ldA * n_rest + n_rest; | |||
float *const A_TR = A + *ldA * (n_rest + n1) + n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
float *const Work_L = Work; | |||
float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,351 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static int c__1 = 1; | |||
static float c_b8 = -1.f; | |||
static float c_b9 = 1.f; | |||
/** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. | |||
* | |||
* This routine is a minor modification of LAPACK's slasyf. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int * | |||
nb, int *kb, float *a, int *lda, int *ipiv, float *w, | |||
int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; | |||
float r__1, r__2, r__3; | |||
/* Builtin functions */ | |||
double sqrt(double); | |||
/* Local variables */ | |||
static int j, k; | |||
static float t, r1, d11, d21, d22; | |||
static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
static float alpha; | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int sscal_(int *, float *, float *, int *), | |||
sgemv_(char *, int *, int *, float *, float *, int *, | |||
float *, int *, float *, float *, int *, ftnlen); | |||
static int kstep; | |||
extern /* Subroutine */ int scopy_(int *, float *, int *, float *, | |||
int *), sswap_(int *, float *, int *, float *, int * | |||
); | |||
static float absakk; | |||
extern int isamax_(int *, float *, int *); | |||
static float colmax, rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
} | |||
kstep = 1; | |||
absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = k - imax; | |||
scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
i__1 = k - imax; | |||
jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
&c__1); | |||
rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
/* Computing MAX */ | |||
r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1], | |||
dabs(r__1)); | |||
rowmax = dmax(r__2,r__3); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >= | |||
alpha * rowmax) { | |||
kp = imax; | |||
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kp != kk) { | |||
a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; | |||
i__1 = kk - 1 - kp; | |||
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
if (kp > 1) { | |||
i__1 = kp - 1; | |||
scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
+ 1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
+ 1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
r1 = 1.f / a[k + k * a_dim1]; | |||
i__1 = k - 1; | |||
sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
if (k > 2) { | |||
d21 = w[k - 1 + kw * w_dim1]; | |||
d11 = w[k + kw * w_dim1] / d21; | |||
d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; | |||
t = 1.f / (d11 * d22 - 1.f); | |||
d21 = t / d21; | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) | |||
* w_dim1] - w[j + kw * w_dim1]); | |||
a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - | |||
w[j + (kw - 1) * w_dim1]); | |||
/* L20: */ | |||
} | |||
} | |||
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; | |||
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; | |||
a[k + k * a_dim1] = w[k + kw * w_dim1]; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
++j; | |||
} | |||
++j; | |||
if (jp != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
} | |||
if (j < *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
i__1 = *n - k + 1; | |||
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k | |||
+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
kstep = 1; | |||
absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
i__1 = imax - k; | |||
scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = *n - imax + 1; | |||
scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
1) * w_dim1], &c__1); | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], | |||
lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * | |||
w_dim1], &c__1, (ftnlen)12); | |||
i__1 = imax - k; | |||
jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
; | |||
rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
/* Computing MAX */ | |||
r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1], | |||
dabs(r__1)); | |||
rowmax = dmax(r__2,r__3); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >= | |||
alpha * rowmax) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kp != kk) { | |||
a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; | |||
i__1 = kp - kk - 1; | |||
scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
1) * a_dim1], lda); | |||
if (kp < *n) { | |||
i__1 = *n - kp; | |||
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
+ kp * a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
} | |||
sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
r1 = 1.f / a[k + k * a_dim1]; | |||
i__1 = *n - k; | |||
sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
d21 = w[k + 1 + k * w_dim1]; | |||
d11 = w[k + 1 + (k + 1) * w_dim1] / d21; | |||
d22 = w[k + k * w_dim1] / d21; | |||
t = 1.f / (d11 * d22 - 1.f); | |||
d21 = t / d21; | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - | |||
w[j + (k + 1) * w_dim1]); | |||
a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * | |||
w_dim1] - w[j + k * w_dim1]); | |||
/* L80: */ | |||
} | |||
} | |||
a[k + k * a_dim1] = w[k + k * w_dim1]; | |||
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; | |||
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
--j; | |||
} | |||
--j; | |||
if (jp != jj && j >= 1) { | |||
sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j > 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,236 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *, | |||
float *, const int *, int *, float *, const int *, int *); | |||
/** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ssytrf_rook. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html | |||
* */ | |||
void RELAPACK_ssytrf_rook( | |||
const char *uplo, const int *n, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
float *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * sizeof(float)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("SSYTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy argument | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** ssytrf_rook's recursive compute kernel */ | |||
static void RELAPACK_ssytrf_rook_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
float *A, const int *ldA, int *ipiv, | |||
float *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = SREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
float *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
float *const A_BL = A + n1; | |||
float *const A_BR = A + *ldA * n1 + n1; | |||
float *const A_BL_B = A + *n; | |||
float *const A_BR_B = A + *ldA * n1 + *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
float *const Work_BL = Work + n1; | |||
float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; | |||
// last row of A_BL | |||
float *const A_BL_b = A_BL + n2_out; | |||
// last row of Work_BL | |||
float *const Work_BL_b = Work_BL + n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = SREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
float *const Work_R = top ? Work : Work + *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
float *const A_TL_T = A + *ldA * n_rest; | |||
float *const A_TR_T = A + *ldA * (n_rest + n1); | |||
float *const A_TL = A + *ldA * n_rest + n_rest; | |||
float *const A_TR = A + *ldA * (n_rest + n1) + n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
float *const Work_L = Work; | |||
float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,451 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static int c__1 = 1; | |||
static float c_b9 = -1.f; | |||
static float c_b10 = 1.f; | |||
/** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. | |||
* | |||
* This routine is a minor modification of LAPACK's slasyf_rook. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n, | |||
int *nb, int *kb, float *a, int *lda, int *ipiv, float * | |||
w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; | |||
float r__1; | |||
/* Builtin functions */ | |||
double sqrt(double); | |||
/* Local variables */ | |||
static int j, k, p; | |||
static float t, r1, d11, d12, d21, d22; | |||
static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
static logical done; | |||
static int imax, jmax; | |||
static float alpha; | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int sscal_(int *, float *, float *, int *); | |||
static float sfmin; | |||
static int itemp; | |||
extern /* Subroutine */ int sgemv_(char *, int *, int *, float *, | |||
float *, int *, float *, int *, float *, float *, int *, | |||
ftnlen); | |||
static int kstep; | |||
static float stemp; | |||
extern /* Subroutine */ int scopy_(int *, float *, int *, float *, | |||
int *), sswap_(int *, float *, int *, float *, int * | |||
); | |||
static float absakk; | |||
extern double slamch_(char *, ftnlen); | |||
extern int isamax_(int *, float *, int *); | |||
static float colmax, rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
sfmin = slamch_("S", (ftnlen)1); | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
kstep = 1; | |||
p = k; | |||
scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
} | |||
absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L12: | |||
scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = k - imax; | |||
scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
if (imax != k) { | |||
i__1 = k - imax; | |||
jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
w_dim1], &c__1); | |||
rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); | |||
} else { | |||
rowmax = 0.f; | |||
} | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1)); | |||
if (stemp > rowmax) { | |||
rowmax = stemp; | |||
jmax = itemp; | |||
} | |||
} | |||
if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) < | |||
alpha * rowmax)) { | |||
kp = imax; | |||
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} | |||
if (! done) { | |||
goto L12; | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kstep == 2 && p != k) { | |||
i__1 = k - p; | |||
scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
a_dim1], lda); | |||
scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
c__1); | |||
i__1 = *n - k + 1; | |||
sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
lda); | |||
i__1 = *n - kk + 1; | |||
sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
ldw); | |||
} | |||
if (kp != kk) { | |||
a[kp + k * a_dim1] = a[kk + k * a_dim1]; | |||
i__1 = k - 1 - kp; | |||
scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
c__1); | |||
i__1 = *n - kk + 1; | |||
sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
lda); | |||
i__1 = *n - kk + 1; | |||
sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
if (k > 1) { | |||
if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { | |||
r1 = 1.f / a[k + k * a_dim1]; | |||
i__1 = k - 1; | |||
sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else if (a[k + k * a_dim1] != 0.f) { | |||
i__1 = k - 1; | |||
for (ii = 1; ii <= i__1; ++ii) { | |||
a[ii + k * a_dim1] /= a[k + k * a_dim1]; | |||
/* L14: */ | |||
} | |||
} | |||
} | |||
} else { | |||
if (k > 2) { | |||
d12 = w[k - 1 + kw * w_dim1]; | |||
d11 = w[k + kw * w_dim1] / d12; | |||
d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; | |||
t = 1.f / (d11 * d22 - 1.f); | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * | |||
w_dim1] - w[j + kw * w_dim1]) / d12); | |||
a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - | |||
w[j + (kw - 1) * w_dim1]) / d12); | |||
/* L20: */ | |||
} | |||
} | |||
a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; | |||
a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; | |||
a[k + k * a_dim1] = w[k + kw * w_dim1]; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
++j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
++j; | |||
if (jp2 != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
jj = j - 1; | |||
if (jp1 != jj && kstep == 2) { | |||
i__1 = *n - j + 1; | |||
sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
if (j <= *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
kstep = 1; | |||
p = k; | |||
i__1 = *n - k + 1; | |||
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & | |||
w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); | |||
} else { | |||
colmax = 0.f; | |||
} | |||
if (dmax(absakk,colmax) == 0.f) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = *n - k + 1; | |||
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L72: | |||
i__1 = imax - k; | |||
scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = *n - imax + 1; | |||
scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
1) * w_dim1], &c__1); | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] | |||
, lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k | |||
+ 1) * w_dim1], &c__1, (ftnlen)12); | |||
} | |||
if (imax != k) { | |||
i__1 = imax - k; | |||
jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
c__1); | |||
rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); | |||
} else { | |||
rowmax = 0.f; | |||
} | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1)); | |||
if (stemp > rowmax) { | |||
rowmax = stemp; | |||
jmax = itemp; | |||
} | |||
} | |||
if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) < | |||
alpha * rowmax)) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
i__1 = *n - k + 1; | |||
scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
} | |||
if (! done) { | |||
goto L72; | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kstep == 2 && p != k) { | |||
i__1 = p - k; | |||
scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
lda); | |||
i__1 = *n - p + 1; | |||
scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
c__1); | |||
sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
} | |||
if (kp != kk) { | |||
a[kp + k * a_dim1] = a[kk + k * a_dim1]; | |||
i__1 = kp - k - 1; | |||
scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
* a_dim1], lda); | |||
i__1 = *n - kp + 1; | |||
scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
a_dim1], &c__1); | |||
sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { | |||
r1 = 1.f / a[k + k * a_dim1]; | |||
i__1 = *n - k; | |||
sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} else if (a[k + k * a_dim1] != 0.f) { | |||
i__1 = *n; | |||
for (ii = k + 1; ii <= i__1; ++ii) { | |||
a[ii + k * a_dim1] /= a[k + k * a_dim1]; | |||
/* L74: */ | |||
} | |||
} | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
d21 = w[k + 1 + k * w_dim1]; | |||
d11 = w[k + 1 + (k + 1) * w_dim1] / d21; | |||
d22 = w[k + k * w_dim1] / d21; | |||
t = 1.f / (d11 * d22 - 1.f); | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[ | |||
j + (k + 1) * w_dim1]) / d21); | |||
a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) * | |||
w_dim1] - w[j + k * w_dim1]) / d21); | |||
/* L80: */ | |||
} | |||
} | |||
a[k + k * a_dim1] = w[k + k * w_dim1]; | |||
a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; | |||
a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
--j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
--j; | |||
if (jp2 != jj && j >= 1) { | |||
sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
jj = j + 1; | |||
if (jp1 != jj && kstep == 2) { | |||
sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j >= 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,274 @@ | |||
#include "relapack.h" | |||
#include <math.h> | |||
static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, | |||
const int *, const float *, const int *, const float *, const int *, | |||
float *, const int *, const float *, const int *, const float *, | |||
const int *, float *, const int *, float *, float *, float *, int *, int *, | |||
int *); | |||
/** STGSYL solves the generalized Sylvester equation. | |||
* | |||
* This routine is functionally equivalent to LAPACK's stgsyl. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html | |||
* */ | |||
void RELAPACK_stgsyl( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, | |||
const float *D, const int *ldD, const float *E, const int *ldE, | |||
float *F, const int *ldF, | |||
float *scale, float *dif, | |||
float *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
// Parse arguments | |||
const int notran = LAPACK(lsame)(trans, "N"); | |||
const int tran = LAPACK(lsame)(trans, "T"); | |||
// Compute work buffer size | |||
int lwmin = 1; | |||
if (notran && (*ijob == 1 || *ijob == 2)) | |||
lwmin = MAX(1, 2 * *m * *n); | |||
*info = 0; | |||
// Check arguments | |||
if (!tran && !notran) | |||
*info = -1; | |||
else if (notran && (*ijob < 0 || *ijob > 4)) | |||
*info = -2; | |||
else if (*m <= 0) | |||
*info = -3; | |||
else if (*n <= 0) | |||
*info = -4; | |||
else if (*ldA < MAX(1, *m)) | |||
*info = -6; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -8; | |||
else if (*ldC < MAX(1, *m)) | |||
*info = -10; | |||
else if (*ldD < MAX(1, *m)) | |||
*info = -12; | |||
else if (*ldE < MAX(1, *n)) | |||
*info = -14; | |||
else if (*ldF < MAX(1, *m)) | |||
*info = -16; | |||
else if (*lWork < lwmin && *lWork != -1) | |||
*info = -20; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("STGSYL", &minfo); | |||
return; | |||
} | |||
if (*lWork == -1) { | |||
// Work size query | |||
*Work = lwmin; | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleantrans = notran ? 'N' : 'T'; | |||
// Constant | |||
const float ZERO[] = { 0. }; | |||
int isolve = 1; | |||
int ifunc = 0; | |||
if (notran) { | |||
if (*ijob >= 3) { | |||
ifunc = *ijob - 2; | |||
LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
} else if (*ijob >= 1) | |||
isolve = 2; | |||
} | |||
float scale2; | |||
int iround; | |||
for (iround = 1; iround <= isolve; iround++) { | |||
*scale = 1; | |||
float dscale = 0; | |||
float dsum = 1; | |||
int pq; | |||
RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); | |||
if (dscale != 0) { | |||
if (*ijob == 1 || *ijob == 3) | |||
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); | |||
else | |||
*dif = sqrt(pq) / (dscale * sqrt(dsum)); | |||
} | |||
if (isolve == 2) { | |||
if (iround == 1) { | |||
if (notran) | |||
ifunc = *ijob; | |||
scale2 = *scale; | |||
LAPACK(slacpy)("F", m, n, C, ldC, Work, m); | |||
LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m); | |||
LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
} else { | |||
LAPACK(slacpy)("F", m, n, Work, m, C, ldC); | |||
LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF); | |||
*scale = scale2; | |||
} | |||
} | |||
} | |||
} | |||
/** stgsyl's recursive vompute kernel */ | |||
static void RELAPACK_stgsyl_rec( | |||
const char *trans, const int *ifunc, const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, | |||
const float *D, const int *ldD, const float *E, const int *ldE, | |||
float *F, const int *ldF, | |||
float *scale, float *dsum, float *dscale, | |||
int *iWork, int *pq, int *info | |||
) { | |||
if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) { | |||
// Unblocked | |||
LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
const int iONE[] = { 1 }; | |||
// Outputs | |||
float scale1[] = { 1. }; | |||
float scale2[] = { 1. }; | |||
int info1[] = { 0 }; | |||
int info2[] = { 0 }; | |||
if (*m > *n) { | |||
// Splitting | |||
int m1 = SREC_SPLIT(*m); | |||
if (A[m1 + *ldA * (m1 - 1)]) | |||
m1++; | |||
const int m2 = *m - m1; | |||
// A_TL A_TR | |||
// 0 A_BR | |||
const float *const A_TL = A; | |||
const float *const A_TR = A + *ldA * m1; | |||
const float *const A_BR = A + *ldA * m1 + m1; | |||
// C_T | |||
// C_B | |||
float *const C_T = C; | |||
float *const C_B = C + m1; | |||
// D_TL D_TR | |||
// 0 D_BR | |||
const float *const D_TL = D; | |||
const float *const D_TR = D + *ldD * m1; | |||
const float *const D_BR = D + *ldD * m1 + m1; | |||
// F_T | |||
// F_B | |||
float *const F_T = F; | |||
float *const F_B = F + m1; | |||
if (*trans == 'N') { | |||
// recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
// C_T = C_T - A_TR * C_B | |||
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
// F_T = F_T - D_TR * C_B | |||
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
// recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
} | |||
} else { | |||
// recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
// apply scale | |||
if (scale1[0] != 1) | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
// C_B = C_B - A_TR^H * C_T | |||
BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
// C_B = C_B - D_TR^H * F_T | |||
BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
// recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
} | |||
} | |||
} else { | |||
// Splitting | |||
int n1 = SREC_SPLIT(*n); | |||
if (B[n1 + *ldB * (n1 - 1)]) | |||
n1++; | |||
const int n2 = *n - n1; | |||
// B_TL B_TR | |||
// 0 B_BR | |||
const float *const B_TL = B; | |||
const float *const B_TR = B + *ldB * n1; | |||
const float *const B_BR = B + *ldB * n1 + n1; | |||
// C_L C_R | |||
float *const C_L = C; | |||
float *const C_R = C + *ldC * n1; | |||
// E_TL E_TR | |||
// 0 E_BR | |||
const float *const E_TL = E; | |||
const float *const E_TR = E + *ldE * n1; | |||
const float *const E_BR = E + *ldE * n1 + n1; | |||
// F_L F_R | |||
float *const F_L = F; | |||
float *const F_R = F + *ldF * n1; | |||
if (*trans == 'N') { | |||
// recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
// C_R = C_R + F_L * B_TR | |||
BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
// F_R = F_R + F_L * E_TR | |||
BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
// recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
} | |||
} else { | |||
// recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
// apply scale | |||
if (scale1[0] != 1) | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
// F_L = F_L + C_R * B_TR | |||
BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
// F_L = F_L + F_R * E_TR | |||
BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
// recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
} | |||
} | |||
} | |||
*scale = scale1[0] * scale2[0]; | |||
*info = info1[0] || info2[0]; | |||
} |
@@ -0,0 +1,169 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_strsyl_rec(const char *, const char *, const int *, | |||
const int *, const int *, const float *, const int *, const float *, | |||
const int *, float *, const int *, float *, int *); | |||
/** STRSYL solves the real Sylvester matrix equation. | |||
* | |||
* This routine is functionally equivalent to LAPACK's strsyl. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html | |||
* */ | |||
void RELAPACK_strsyl( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, float *scale, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int notransA = LAPACK(lsame)(tranA, "N"); | |||
const int transA = LAPACK(lsame)(tranA, "T"); | |||
const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
const int notransB = LAPACK(lsame)(tranB, "N"); | |||
const int transB = LAPACK(lsame)(tranB, "T"); | |||
const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
*info = 0; | |||
if (!transA && !ctransA && !notransA) | |||
*info = -1; | |||
else if (!transB && !ctransB && !notransB) | |||
*info = -2; | |||
else if (*isgn != 1 && *isgn != -1) | |||
*info = -3; | |||
else if (*m < 0) | |||
*info = -4; | |||
else if (*n < 0) | |||
*info = -5; | |||
else if (*ldA < MAX(1, *m)) | |||
*info = -7; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -9; | |||
else if (*ldC < MAX(1, *m)) | |||
*info = -11; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("STRSYL", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); | |||
const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C'); | |||
// Recursive kernel | |||
RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
/** strsyl's recursive compute kernel */ | |||
static void RELAPACK_strsyl_rec( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const float *A, const int *ldA, const float *B, const int *ldB, | |||
float *C, const int *ldC, float *scale, | |||
int *info | |||
) { | |||
if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) { | |||
// Unblocked | |||
RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
const float MSGN[] = { -*isgn }; | |||
const int iONE[] = { 1 }; | |||
// Outputs | |||
float scale1[] = { 1. }; | |||
float scale2[] = { 1. }; | |||
int info1[] = { 0 }; | |||
int info2[] = { 0 }; | |||
if (*m > *n) { | |||
// Splitting | |||
int m1 = SREC_SPLIT(*m); | |||
if (A[m1 + *ldA * (m1 - 1)]) | |||
m1++; | |||
const int m2 = *m - m1; | |||
// A_TL A_TR | |||
// 0 A_BR | |||
const float *const A_TL = A; | |||
const float *const A_TR = A + *ldA * m1; | |||
const float *const A_BR = A + *ldA * m1 + m1; | |||
// C_T | |||
// C_B | |||
float *const C_T = C; | |||
float *const C_B = C + m1; | |||
if (*tranA == 'N') { | |||
// recusion(A_BR, B, C_B) | |||
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
// C_T = C_T - A_TR * C_B | |||
BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
// recusion(A_TL, B, C_T) | |||
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
} else { | |||
// recusion(A_TL, B, C_T) | |||
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
// C_B = C_B - A_TR' * C_T | |||
BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
// recusion(A_BR, B, C_B) | |||
RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
} | |||
} else { | |||
// Splitting | |||
int n1 = SREC_SPLIT(*n); | |||
if (B[n1 + *ldB * (n1 - 1)]) | |||
n1++; | |||
const int n2 = *n - n1; | |||
// B_TL B_TR | |||
// 0 B_BR | |||
const float *const B_TL = B; | |||
const float *const B_TR = B + *ldB * n1; | |||
const float *const B_BR = B + *ldB * n1 + n1; | |||
// C_L C_R | |||
float *const C_L = C; | |||
float *const C_R = C + *ldC * n1; | |||
if (*tranB == 'N') { | |||
// recusion(A, B_TL, C_L) | |||
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
// C_R = C_R -/+ C_L * B_TR | |||
BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
// recusion(A, B_BR, C_R) | |||
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
} else { | |||
// recusion(A, B_BR, C_R) | |||
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
// C_L = C_L -/+ C_R * B_TR' | |||
BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
// recusion(A, B_TL, C_L) | |||
RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
} | |||
} | |||
*scale = scale1[0] * scale2[0]; | |||
*info = info1[0] || info2[0]; | |||
} |
@@ -0,0 +1,107 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_strtri_rec(const char *, const char *, const int *, | |||
float *, const int *, int *); | |||
/** CTRTRI computes the inverse of a real upper or lower triangular matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's strtri. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html | |||
* */ | |||
void RELAPACK_strtri( | |||
const char *uplo, const char *diag, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
const int nounit = LAPACK(lsame)(diag, "N"); | |||
const int unit = LAPACK(lsame)(diag, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (!nounit && !unit) | |||
*info = -2; | |||
else if (*n < 0) | |||
*info = -3; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -5; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("STRTRI", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
const char cleandiag = nounit ? 'N' : 'U'; | |||
// check for singularity | |||
if (nounit) { | |||
int i; | |||
for (i = 0; i < *n; i++) | |||
if (A[i + *ldA * i] == 0) { | |||
*info = i; | |||
return; | |||
} | |||
} | |||
// Recursive kernel | |||
RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
} | |||
/** strtri's recursive compute kernel */ | |||
static void RELAPACK_strtri_rec( | |||
const char *uplo, const char *diag, const int *n, | |||
float *A, const int *ldA, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_STRTRI, 1)) { | |||
// Unblocked | |||
LAPACK(strti2)(uplo, diag, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const float ONE[] = { 1. }; | |||
const float MONE[] = { -1. }; | |||
// Splitting | |||
const int n1 = SREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
float *const A_TL = A; | |||
float *const A_TR = A + *ldA * n1; | |||
float *const A_BL = A + n1; | |||
float *const A_BR = A + *ldA * n1 + n1; | |||
// recursion(A_TL) | |||
RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
if (*uplo == 'L') { | |||
// A_BL = - A_BL * A_TL | |||
BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
// A_BL = A_BR \ A_BL | |||
BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
} else { | |||
// A_TR = - A_TL * A_TR | |||
BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
// A_TR = A_TR / A_BR | |||
BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,230 @@ | |||
#include "relapack.h" | |||
#include "stdlib.h" | |||
static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, | |||
const int *, double *, const int *, int *, double *, const int *, double *, | |||
const int *, int *); | |||
/** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zgbtrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html | |||
* */ | |||
void RELAPACK_zgbtrf( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
double *Ab, const int *ldAb, int *ipiv, | |||
int *info | |||
) { | |||
// Check arguments | |||
*info = 0; | |||
if (*m < 0) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*kl < 0) | |||
*info = -3; | |||
else if (*ku < 0) | |||
*info = -4; | |||
else if (*ldAb < 2 * *kl + *ku + 1) | |||
*info = -6; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZGBTRF", &minfo); | |||
return; | |||
} | |||
// Constant | |||
const double ZERO[] = { 0., 0. }; | |||
// Result upper band width | |||
const int kv = *ku + *kl; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
double *const A = Ab + 2 * kv; | |||
// Zero upper diagonal fill-in elements | |||
int i, j; | |||
for (j = 0; j < *n; j++) { | |||
double *const A_j = A + 2 * *ldA * j; | |||
for (i = MAX(0, j - kv); i < j - *ku; i++) | |||
A_j[2 * i] = A_j[2 * i + 1] = 0.; | |||
} | |||
// Allocate work space | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
const int nWorkl = (kv > n1) ? n1 : kv; | |||
const int mWorku = (*kl > n1) ? n1 : *kl; | |||
const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); | |||
double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); | |||
LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
// Recursive kernel | |||
RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
// Free work space | |||
free(Workl); | |||
free(Worku); | |||
} | |||
/** zgbtrf's recursive compute kernel */ | |||
static void RELAPACK_zgbtrf_rec( | |||
const int *m, const int *n, const int *kl, const int *ku, | |||
double *Ab, const int *ldAb, int *ipiv, | |||
double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { | |||
// Unblocked | |||
LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterators | |||
int i, j; | |||
// Output upper band width | |||
const int kv = *ku + *kl; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
double *const A = Ab + 2 * kv; | |||
// Splitting | |||
const int n1 = MIN(ZREC_SPLIT(*n), *kl); | |||
const int n2 = *n - n1; | |||
const int m1 = MIN(n1, *m); | |||
const int m2 = *m - m1; | |||
const int mn1 = MIN(m1, n1); | |||
const int mn2 = MIN(m2, n2); | |||
// Ab_L * | |||
// Ab_BR | |||
double *const Ab_L = Ab; | |||
double *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
// A_L A_R | |||
double *const A_L = A; | |||
double *const A_R = A + 2 * *ldA * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + 2 * *ldA * n1; | |||
double *const A_BL = A + 2 * m1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * m1; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_T = ipiv; | |||
int *const ipiv_B = ipiv + n1; | |||
// Banded splitting | |||
const int n21 = MIN(n2, kv - n1); | |||
const int n22 = MIN(n2 - n21, n1); | |||
const int m21 = MIN(m2, *kl - m1); | |||
const int m22 = MIN(m2 - m21, m1); | |||
// n1 n21 n22 | |||
// m * A_Rl ARr | |||
double *const A_Rl = A_R; | |||
double *const A_Rr = A_R + 2 * *ldA * n21; | |||
// n1 n21 n22 | |||
// m1 * A_TRl A_TRr | |||
// m21 A_BLt A_BRtl A_BRtr | |||
// m22 A_BLb A_BRbl A_BRbr | |||
double *const A_TRl = A_TR; | |||
double *const A_TRr = A_TR + 2 * *ldA * n21; | |||
double *const A_BLt = A_BL; | |||
double *const A_BLb = A_BL + 2 * m21; | |||
double *const A_BRtl = A_BR; | |||
double *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
double *const A_BRbl = A_BR + 2 * m21; | |||
double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; | |||
// recursion(Ab_L, ipiv_T) | |||
RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
// Workl = A_BLb | |||
LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
// partially redo swaps in A_L | |||
for (i = 0; i < mn1; i++) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
if (ip < *kl) | |||
BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
else | |||
BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
} | |||
} | |||
// apply pivots to A_Rl | |||
LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
// apply pivots to A_Rr columnwise | |||
for (j = 0; j < n22; j++) { | |||
double *const A_Rrj = A_Rr + 2 * *ldA * j; | |||
for (i = j; i < mn1; i++) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
const double tmpr = A_Rrj[2 * i]; | |||
const double tmpc = A_Rrj[2 * i + 1]; | |||
A_Rrj[2 * i] = A_Rrj[2 * ip]; | |||
A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1]; | |||
A_Rrj[2 * ip] = tmpr; | |||
A_Rrj[2 * ip + 1] = tmpc; | |||
} | |||
} | |||
} | |||
// A_TRl = A_TL \ A_TRl | |||
BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
// Worku = A_TRr | |||
LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
// Worku = A_TL \ Worku | |||
BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
// A_TRr = Worku | |||
LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
// A_BRtl = A_BRtl - A_BLt * A_TRl | |||
BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
// A_BRbl = A_BRbl - Workl * A_TRl | |||
BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
// A_BRtr = A_BRtr - A_BLt * Worku | |||
BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
// A_BRbr = A_BRbr - Workl * Worku | |||
BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
// partially undo swaps in A_L | |||
for (i = mn1 - 1; i >= 0; i--) { | |||
const int ip = ipiv_T[i] - 1; | |||
if (ip != i) { | |||
if (ip < *kl) | |||
BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
else | |||
BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
} | |||
} | |||
// recursion(Ab_BR, ipiv_B) | |||
RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
if (*info) | |||
*info += n1; | |||
// shift pivots | |||
for (i = 0; i < mn2; i++) | |||
ipiv_B[i] += n1; | |||
} |
@@ -0,0 +1,167 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, | |||
const int *, const int *, const double *, const double *, const int *, | |||
const double *, const int *, const double *, double *, const int *); | |||
static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, | |||
const int *, const int *, const double *, const double *, const int *, | |||
const double *, const int *, const double *, double *, const int *); | |||
/** ZGEMMT computes a matrix-matrix product with general matrices but updates | |||
* only the upper or lower triangular part of the result matrix. | |||
* | |||
* This routine performs the same operation as the BLAS routine | |||
* zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
* but only updates the triangular part of C specified by uplo: | |||
* If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
* otherwise the upper triangular part is updated. | |||
* */ | |||
void RELAPACK_zgemmt( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
#if HAVE_XGEMMT | |||
BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
return; | |||
#else | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
const int notransA = LAPACK(lsame)(transA, "N"); | |||
const int tranA = LAPACK(lsame)(transA, "T"); | |||
const int ctransA = LAPACK(lsame)(transA, "C"); | |||
const int notransB = LAPACK(lsame)(transB, "N"); | |||
const int tranB = LAPACK(lsame)(transB, "T"); | |||
const int ctransB = LAPACK(lsame)(transB, "C"); | |||
int info = 0; | |||
if (!lower && !upper) | |||
info = 1; | |||
else if (!tranA && !ctransA && !notransA) | |||
info = 2; | |||
else if (!tranB && !ctransB && !notransB) | |||
info = 3; | |||
else if (*n < 0) | |||
info = 4; | |||
else if (*k < 0) | |||
info = 5; | |||
else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
info = 8; | |||
else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
info = 10; | |||
else if (*ldC < MAX(1, *n)) | |||
info = 13; | |||
if (info) { | |||
LAPACK(xerbla)("ZGEMMT", &info); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); | |||
const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); | |||
// Recursive kernel | |||
RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
#endif | |||
} | |||
/** zgemmt's recursive compute kernel */ | |||
static void RELAPACK_zgemmt_rec( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { | |||
// Unblocked | |||
RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
return; | |||
} | |||
// Splitting | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_T | |||
// A_B | |||
const double *const A_T = A; | |||
const double *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); | |||
// B_L B_R | |||
const double *const B_L = B; | |||
const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); | |||
// C_TL C_TR | |||
// C_BL C_BR | |||
double *const C_TL = C; | |||
double *const C_TR = C + 2 * *ldC * n1; | |||
double *const C_BL = C + 2 * n1; | |||
double *const C_BR = C + 2 * *ldC * n1 + 2 * n1; | |||
// recursion(C_TL) | |||
RELAPACK_zgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
if (*uplo == 'L') | |||
// C_BL = alpha A_B B_L + beta C_BL | |||
BLAS(zgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
else | |||
// C_TR = alpha A_T B_R + beta C_TR | |||
BLAS(zgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
// recursion(C_BR) | |||
RELAPACK_zgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
} | |||
/** zgemmt's unblocked compute kernel */ | |||
static void RELAPACK_zgemmt_rec2( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const double *alpha, const double *A, const int *ldA, | |||
const double *B, const int *ldB, | |||
const double *beta, double *C, const int *ldC | |||
) { | |||
const int incB = (*transB == 'N') ? 1 : *ldB; | |||
const int incC = 1; | |||
int i; | |||
for (i = 0; i < *n; i++) { | |||
// A_0 | |||
// A_i | |||
const double *const A_0 = A; | |||
const double *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i); | |||
// * B_i * | |||
const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i); | |||
// * C_0i * | |||
// * C_ii * | |||
double *const C_0i = C + 2 * *ldC * i; | |||
double *const C_ii = C + 2 * *ldC * i + 2 * i; | |||
if (*uplo == 'L') { | |||
const int nmi = *n - i; | |||
if (*transA == 'N') | |||
BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
else | |||
BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
} else { | |||
const int ip1 = i + 1; | |||
if (*transA == 'N') | |||
BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
else | |||
BLAS(zgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
} | |||
} | |||
} |
@@ -0,0 +1,117 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_zgetrf_rec(const int *, const int *, double *, | |||
const int *, int *, int *); | |||
/** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zgetrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html | |||
* */ | |||
void RELAPACK_zgetrf( | |||
const int *m, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
// Check arguments | |||
*info = 0; | |||
if (*m < 0) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZGETRF", &minfo); | |||
return; | |||
} | |||
const int sn = MIN(*m, *n); | |||
RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
// Right remainder | |||
if (*m < *n) { | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Splitting | |||
const int rn = *n - *m; | |||
// A_L A_R | |||
const double *const A_L = A; | |||
double *const A_R = A + 2 * *ldA * *m; | |||
// A_R = apply(ipiv, A_R) | |||
LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
// A_R = A_L \ A_R | |||
BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
} | |||
} | |||
/** zgetrf's recursive compute kernel */ | |||
static void RELAPACK_zgetrf_rec( | |||
const int *m, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_ZGETRF, 1)) { | |||
// Unblocked | |||
LAPACK(zgetf2)(m, n, A, ldA, ipiv, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1. }; | |||
// Splitting | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
const int m2 = *m - n1; | |||
// A_L A_R | |||
double *const A_L = A; | |||
double *const A_R = A + 2 * *ldA * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + 2 * *ldA * n1; | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_T = ipiv; | |||
int *const ipiv_B = ipiv + n1; | |||
// recursion(A_L, ipiv_T) | |||
RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
// apply pivots to A_R | |||
LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
// A_TR = A_TL \ A_TR | |||
BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
// A_BR = A_BR - A_BL * A_TR | |||
BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
// recursion(A_BR, ipiv_B) | |||
RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
if (*info) | |||
*info += n1; | |||
// apply pivots to A_BL | |||
LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
ipiv_B[i] += n1; | |||
} |
@@ -0,0 +1,212 @@ | |||
#include "relapack.h" | |||
#if XSYGST_ALLOW_MALLOC | |||
#include "stdlib.h" | |||
#endif | |||
static void RELAPACK_zhegst_rec(const int *, const char *, const int *, | |||
double *, const int *, const double *, const int *, | |||
double *, const int *, int *); | |||
/** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zhegst. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html | |||
* */ | |||
void RELAPACK_zhegst( | |||
const int *itype, const char *uplo, const int *n, | |||
double *A, const int *ldA, const double *B, const int *ldB, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (*itype < 1 || *itype > 3) | |||
*info = -1; | |||
else if (!lower && !upper) | |||
*info = -2; | |||
else if (*n < 0) | |||
*info = -3; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -5; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -7; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZHEGST", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Allocate work space | |||
double *Work = NULL; | |||
int lWork = 0; | |||
#if XSYGST_ALLOW_MALLOC | |||
const int n1 = ZREC_SPLIT(*n); | |||
lWork = n1 * (*n - n1); | |||
Work = malloc(lWork * 2 * sizeof(double)); | |||
if (!Work) | |||
lWork = 0; | |||
#endif | |||
// recursive kernel | |||
RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
// Free work space | |||
#if XSYGST_ALLOW_MALLOC | |||
if (Work) | |||
free(Work); | |||
#endif | |||
} | |||
/** zhegst's recursive compute kernel */ | |||
static void RELAPACK_zhegst_rec( | |||
const int *itype, const char *uplo, const int *n, | |||
double *A, const int *ldA, const double *B, const int *ldB, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { | |||
// Unblocked | |||
LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info); | |||
return; | |||
} | |||
// Constants | |||
const double ZERO[] = { 0., 0. }; | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const double HALF[] = { .5, 0. }; | |||
const double MHALF[] = { -.5, 0. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterator | |||
int i; | |||
// Splitting | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + 2 * *ldA * n1; | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// B_TL B_TR | |||
// B_BL B_BR | |||
const double *const B_TL = B; | |||
const double *const B_TR = B + 2 * *ldB * n1; | |||
const double *const B_BL = B + 2 * n1; | |||
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
// recursion(A_TL, B_TL) | |||
RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); | |||
if (*itype == 1) | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL / B_TL' | |||
BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
if (*lWork >= n2 * n1) { | |||
// T = -1/2 * B_BL * A_TL | |||
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
} else | |||
// A_BL = A_BL - 1/2 B_BL * A_TL | |||
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' | |||
BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); | |||
if (*lWork >= n2 * n1) | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
else | |||
// A_BL = A_BL - 1/2 B_BL * A_TL | |||
BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BL = B_BR \ A_BL | |||
BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
} else { | |||
// A_TR = B_TL' \ A_TR | |||
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
if (*lWork >= n2 * n1) { | |||
// T = -1/2 * A_TL * B_TR | |||
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
// A_TR = A_BL + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
} else | |||
// A_TR = A_TR - 1/2 A_TL * B_TR | |||
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR | |||
BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); | |||
if (*lWork >= n2 * n1) | |||
// A_TR = A_BL + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
else | |||
// A_TR = A_TR - 1/2 A_TL * B_TR | |||
BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TR = A_TR / B_BR | |||
BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
} | |||
else | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL * B_TL | |||
BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
if (*lWork >= n2 * n1) { | |||
// T = 1/2 * A_BR * B_BL | |||
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
} else | |||
// A_BL = A_BL + 1/2 A_BR * B_BL | |||
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL | |||
BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); | |||
if (*lWork >= n2 * n1) | |||
// A_BL = A_BL + T | |||
for (i = 0; i < n1; i++) | |||
BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
else | |||
// A_BL = A_BL + 1/2 A_BR * B_BL | |||
BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
// A_BL = B_BR * A_BL | |||
BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
} else { | |||
// A_TR = B_TL * A_TR | |||
BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
if (*lWork >= n2 * n1) { | |||
// T = 1/2 * B_TR * A_BR | |||
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
// A_TR = A_TR + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
} else | |||
// A_TR = A_TR + 1/2 B_TR A_BR | |||
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' | |||
BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); | |||
if (*lWork >= n2 * n1) | |||
// A_TR = A_TR + T | |||
for (i = 0; i < n2; i++) | |||
BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
else | |||
// A_TR = A_TR + 1/2 B_TR * A_BR | |||
BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
// A_TR = A_TR * B_BR | |||
BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
} | |||
// recursion(A_BR, B_BR) | |||
RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); | |||
} |
@@ -0,0 +1,236 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *, | |||
double *, const int *, int *, double *, const int *, int *); | |||
/** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zhetrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html | |||
* */ | |||
void RELAPACK_zhetrf( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
double *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * 2 * sizeof(double)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZHETRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy argument | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** zhetrf's recursive compute kernel */ | |||
static void RELAPACK_zhetrf_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_ZHETRF, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = ZREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
double *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
double *const A_BL_B = A + 2 * *n; | |||
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
double *const Work_BL = Work + 2 * n1; | |||
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
// last row of A_BL | |||
double *const A_BL_b = A_BL + 2 * n2_out; | |||
// last row of Work_BL | |||
double *const Work_BL_b = Work_BL + 2 * n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = ZREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
double *const A_TL_T = A + 2 * *ldA * n_rest; | |||
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
double *const Work_L = Work; | |||
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,524 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static doublecomplex c_b1 = {1.,0.}; | |||
static int c__1 = 1; | |||
/** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method | |||
* | |||
* This routine is a minor modification of LAPACK's zlahef. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int * | |||
nb, int *kb, doublecomplex *a, int *lda, int *ipiv, | |||
doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
double d__1, d__2, d__3, d__4; | |||
doublecomplex z__1, z__2, z__3, z__4; | |||
/* Builtin functions */ | |||
double sqrt(double), d_imag(doublecomplex *); | |||
void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *, | |||
doublecomplex *, doublecomplex *); | |||
/* Local variables */ | |||
static int j, k; | |||
static double t, r1; | |||
static doublecomplex d11, d21, d22; | |||
static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
static double alpha; | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
static int kstep; | |||
extern /* Subroutine */ int zgemv_(char *, int *, int *, | |||
doublecomplex *, doublecomplex *, int *, doublecomplex *, | |||
int *, doublecomplex *, doublecomplex *, int *, ftnlen), | |||
zcopy_(int *, doublecomplex *, int *, doublecomplex *, | |||
int *), zswap_(int *, doublecomplex *, int *, | |||
doublecomplex *, int *); | |||
static double absakk; | |||
extern /* Subroutine */ int zdscal_(int *, double *, | |||
doublecomplex *, int *); | |||
static double colmax; | |||
extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) | |||
; | |||
extern int izamax_(int *, doublecomplex *, int *); | |||
static double rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.) + 1.) / 8.; | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
kstep = 1; | |||
i__1 = k - 1; | |||
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = k + kw * w_dim1; | |||
i__2 = k + k * a_dim1; | |||
d__1 = a[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
i__1 = k + kw * w_dim1; | |||
i__2 = k + kw * w_dim1; | |||
d__1 = w[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
} | |||
i__1 = k + kw * w_dim1; | |||
absakk = (d__1 = w[i__1].r, abs(d__1)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = imax + kw * w_dim1; | |||
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
kw * w_dim1]), abs(d__2)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * a_dim1; | |||
d__1 = a[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
i__1 = imax - 1; | |||
zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
i__2 = imax + imax * a_dim1; | |||
d__1 = a[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
i__1 = k - imax; | |||
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
i__1 = k - imax; | |||
zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
i__2 = imax + (kw - 1) * w_dim1; | |||
d__1 = w[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
} | |||
i__1 = k - imax; | |||
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
&c__1); | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
jmax + (kw - 1) * w_dim1]), abs(d__2)); | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
/* Computing MAX */ | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( | |||
d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( | |||
d__2)); | |||
rowmax = max(d__3,d__4); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else /* if(complicated condition) */ { | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { | |||
kp = imax; | |||
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
d__1 = a[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
i__1 = kk - 1 - kp; | |||
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
i__1 = kk - 1 - kp; | |||
zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
if (kp > 1) { | |||
i__1 = kp - 1; | |||
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
+ 1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
+ 1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
if (k > 1) { | |||
i__1 = k + k * a_dim1; | |||
r1 = 1. / a[i__1].r; | |||
i__1 = k - 1; | |||
zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
i__1 = k - 1; | |||
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
} | |||
} else { | |||
if (k > 2) { | |||
i__1 = k - 1 + kw * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
d_cnjg(&z__2, &d21); | |||
z_div(&z__1, &w[k + kw * w_dim1], &z__2); | |||
d11.r = z__1.r, d11.i = z__1.i; | |||
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
d22.r = z__1.r, d22.i = z__1.i; | |||
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
t = 1. / (z__1.r - 1.); | |||
z__2.r = t, z__2.i = 0.; | |||
z_div(&z__1, &z__2, &d21); | |||
d21.r = z__1.r, d21.i = z__1.i; | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
i__2 = j + (k - 1) * a_dim1; | |||
i__3 = j + (kw - 1) * w_dim1; | |||
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + kw * w_dim1; | |||
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
.i; | |||
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
d21.r * z__2.i + d21.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
i__2 = j + k * a_dim1; | |||
d_cnjg(&z__2, &d21); | |||
i__3 = j + kw * w_dim1; | |||
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + (kw - 1) * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = | |||
z__2.r * z__3.i + z__2.i * z__3.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L20: */ | |||
} | |||
} | |||
i__1 = k - 1 + (k - 1) * a_dim1; | |||
i__2 = k - 1 + (kw - 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1 + k * a_dim1; | |||
i__2 = k - 1 + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1; | |||
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = k - 2; | |||
zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
++j; | |||
} | |||
++j; | |||
if (jp != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
} | |||
if (j < *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
kstep = 1; | |||
i__1 = k + k * w_dim1; | |||
i__2 = k + k * a_dim1; | |||
d__1 = a[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
w_dim1], &c__1); | |||
} | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k | |||
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
i__1 = k + k * w_dim1; | |||
i__2 = k + k * w_dim1; | |||
d__1 = w[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
i__1 = k + k * w_dim1; | |||
absakk = (d__1 = w[i__1].r, abs(d__1)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = imax + k * w_dim1; | |||
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
k * w_dim1]), abs(d__2)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * a_dim1; | |||
d__1 = a[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
i__1 = imax - k; | |||
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = imax - k; | |||
zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
i__1 = imax + (k + 1) * w_dim1; | |||
i__2 = imax + imax * a_dim1; | |||
d__1 = a[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
imax + 1 + (k + 1) * w_dim1], &c__1); | |||
} | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], | |||
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * | |||
w_dim1], &c__1, (ftnlen)12); | |||
i__1 = imax + (k + 1) * w_dim1; | |||
i__2 = imax + (k + 1) * w_dim1; | |||
d__1 = w[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
i__1 = imax - k; | |||
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
; | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
jmax + (k + 1) * w_dim1]), abs(d__2)); | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
/* Computing MAX */ | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( | |||
d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( | |||
d__2)); | |||
rowmax = max(d__3,d__4); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else /* if(complicated condition) */ { | |||
i__1 = imax + (k + 1) * w_dim1; | |||
if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + | |||
k * w_dim1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
d__1 = a[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
i__1 = kp - kk - 1; | |||
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
1) * a_dim1], lda); | |||
i__1 = kp - kk - 1; | |||
zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
if (kp < *n) { | |||
i__1 = *n - kp; | |||
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
+ kp * a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
} | |||
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
i__1 = k + k * a_dim1; | |||
r1 = 1. / a[i__1].r; | |||
i__1 = *n - k; | |||
zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
i__1 = *n - k; | |||
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
i__1 = k + 1 + k * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
d11.r = z__1.r, d11.i = z__1.i; | |||
d_cnjg(&z__2, &d21); | |||
z_div(&z__1, &w[k + k * w_dim1], &z__2); | |||
d22.r = z__1.r, d22.i = z__1.i; | |||
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
t = 1. / (z__1.r - 1.); | |||
z__2.r = t, z__2.i = 0.; | |||
z_div(&z__1, &z__2, &d21); | |||
d21.r = z__1.r, d21.i = z__1.i; | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
i__2 = j + k * a_dim1; | |||
d_cnjg(&z__2, &d21); | |||
i__3 = j + k * w_dim1; | |||
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + (k + 1) * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = | |||
z__2.r * z__3.i + z__2.i * z__3.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
i__2 = j + (k + 1) * a_dim1; | |||
i__3 = j + (k + 1) * w_dim1; | |||
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + k * w_dim1; | |||
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
.i; | |||
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
d21.r * z__2.i + d21.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L80: */ | |||
} | |||
} | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + k * a_dim1; | |||
i__2 = k + 1 + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + (k + 1) * a_dim1; | |||
i__2 = k + 1 + (k + 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = *n - k; | |||
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = *n - k - 1; | |||
zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
--j; | |||
} | |||
--j; | |||
if (jp != jj && j >= 1) { | |||
zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j > 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,236 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *, | |||
double *, const int *, int *, double *, const int *, int *); | |||
/** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zhetrf_rook. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html | |||
* */ | |||
void RELAPACK_zhetrf_rook( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
double *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * 2 * sizeof(double)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZHETRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy argument | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** zhetrf_rook's recursive compute kernel */ | |||
static void RELAPACK_zhetrf_rook_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = ZREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
double *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
double *const A_BL_B = A + 2 * *n; | |||
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
double *const Work_BL = Work + 2 * n1; | |||
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
// last row of A_BL | |||
double *const A_BL_b = A_BL + 2 * n2_out; | |||
// last row of Work_BL | |||
double *const Work_BL_b = Work_BL + 2 * n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = ZREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
double *const A_TL_T = A + 2 * *ldA * n_rest; | |||
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
double *const Work_L = Work; | |||
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,662 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static doublecomplex c_b1 = {1.,0.}; | |||
static int c__1 = 1; | |||
/** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method | |||
* | |||
* This routine is a minor modification of LAPACK's zlahef_rook. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n, | |||
int *nb, int *kb, doublecomplex *a, int *lda, int * | |||
ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
double d__1, d__2; | |||
doublecomplex z__1, z__2, z__3, z__4, z__5; | |||
/* Builtin functions */ | |||
double sqrt(double), d_imag(doublecomplex *); | |||
void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *, | |||
doublecomplex *, doublecomplex *); | |||
/* Local variables */ | |||
static int j, k, p; | |||
static double t, r1; | |||
static doublecomplex d11, d21, d22; | |||
static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
static logical done; | |||
static int imax, jmax; | |||
static double alpha; | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
static double dtemp, sfmin; | |||
static int itemp, kstep; | |||
extern /* Subroutine */ int zgemv_(char *, int *, int *, | |||
doublecomplex *, doublecomplex *, int *, doublecomplex *, | |||
int *, doublecomplex *, doublecomplex *, int *, ftnlen), | |||
zcopy_(int *, doublecomplex *, int *, doublecomplex *, | |||
int *), zswap_(int *, doublecomplex *, int *, | |||
doublecomplex *, int *); | |||
extern double dlamch_(char *, ftnlen); | |||
static double absakk; | |||
extern /* Subroutine */ int zdscal_(int *, double *, | |||
doublecomplex *, int *); | |||
static double colmax; | |||
extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) | |||
; | |||
extern int izamax_(int *, doublecomplex *, int *); | |||
static double rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.) + 1.) / 8.; | |||
sfmin = dlamch_("S", (ftnlen)1); | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
kstep = 1; | |||
p = k; | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & | |||
c__1); | |||
} | |||
i__1 = k + kw * w_dim1; | |||
i__2 = k + k * a_dim1; | |||
d__1 = a[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
i__1 = k + kw * w_dim1; | |||
i__2 = k + kw * w_dim1; | |||
d__1 = w[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
} | |||
i__1 = k + kw * w_dim1; | |||
absakk = (d__1 = w[i__1].r, abs(d__1)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = imax + kw * w_dim1; | |||
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
kw * w_dim1]), abs(d__2)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
d__1 = w[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], | |||
&c__1); | |||
} | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L12: | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
} | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
i__2 = imax + imax * a_dim1; | |||
d__1 = a[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
i__1 = k - imax; | |||
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
i__1 = k - imax; | |||
zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
i__2 = imax + (kw - 1) * w_dim1; | |||
d__1 = w[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
} | |||
if (imax != k) { | |||
i__1 = k - imax; | |||
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
w_dim1], &c__1); | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& | |||
w[jmax + (kw - 1) * w_dim1]), abs(d__2)); | |||
} else { | |||
rowmax = 0.; | |||
} | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
i__1 = itemp + (kw - 1) * w_dim1; | |||
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
itemp + (kw - 1) * w_dim1]), abs(d__2)); | |||
if (dtemp > rowmax) { | |||
rowmax = dtemp; | |||
jmax = itemp; | |||
} | |||
} | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { | |||
kp = imax; | |||
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} | |||
if (! done) { | |||
goto L12; | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kstep == 2 && p != k) { | |||
i__1 = p + p * a_dim1; | |||
i__2 = k + k * a_dim1; | |||
d__1 = a[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
i__1 = k - 1 - p; | |||
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
a_dim1], lda); | |||
i__1 = k - 1 - p; | |||
zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); | |||
if (p > 1) { | |||
i__1 = p - 1; | |||
zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + | |||
1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + | |||
1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
ldw); | |||
} | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
d__1 = a[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
i__1 = kk - 1 - kp; | |||
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
i__1 = kk - 1 - kp; | |||
zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
if (kp > 1) { | |||
i__1 = kp - 1; | |||
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
+ 1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
+ 1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
if (k > 1) { | |||
i__1 = k + k * a_dim1; | |||
t = a[i__1].r; | |||
if (abs(t) >= sfmin) { | |||
r1 = 1. / t; | |||
i__1 = k - 1; | |||
zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
i__1 = k - 1; | |||
for (ii = 1; ii <= i__1; ++ii) { | |||
i__2 = ii + k * a_dim1; | |||
i__3 = ii + k * a_dim1; | |||
z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L14: */ | |||
} | |||
} | |||
i__1 = k - 1; | |||
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
} | |||
} else { | |||
if (k > 2) { | |||
i__1 = k - 1 + kw * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
d_cnjg(&z__2, &d21); | |||
z_div(&z__1, &w[k + kw * w_dim1], &z__2); | |||
d11.r = z__1.r, d11.i = z__1.i; | |||
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
d22.r = z__1.r, d22.i = z__1.i; | |||
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
t = 1. / (z__1.r - 1.); | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
i__2 = j + (k - 1) * a_dim1; | |||
i__3 = j + (kw - 1) * w_dim1; | |||
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + kw * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
z_div(&z__2, &z__3, &d21); | |||
z__1.r = t * z__2.r, z__1.i = t * z__2.i; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + kw * w_dim1; | |||
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + (kw - 1) * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
d_cnjg(&z__5, &d21); | |||
z_div(&z__2, &z__3, &z__5); | |||
z__1.r = t * z__2.r, z__1.i = t * z__2.i; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L20: */ | |||
} | |||
} | |||
i__1 = k - 1 + (k - 1) * a_dim1; | |||
i__2 = k - 1 + (kw - 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1 + k * a_dim1; | |||
i__2 = k - 1 + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1; | |||
zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = k - 2; | |||
zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
++j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
++j; | |||
if (jp2 != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
++jj; | |||
if (kstep == 2 && jp1 != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
if (j < *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
kstep = 1; | |||
p = k; | |||
i__1 = k + k * w_dim1; | |||
i__2 = k + k * a_dim1; | |||
d__1 = a[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
w_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & | |||
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
ftnlen)12); | |||
i__1 = k + k * w_dim1; | |||
i__2 = k + k * w_dim1; | |||
d__1 = w[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
} | |||
i__1 = k + k * w_dim1; | |||
absakk = (d__1 = w[i__1].r, abs(d__1)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = imax + k * w_dim1; | |||
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
k * w_dim1]), abs(d__2)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
d__1 = w[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * | |||
a_dim1], &c__1); | |||
} | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L72: | |||
i__1 = imax - k; | |||
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = imax - k; | |||
zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
i__1 = imax + (k + 1) * w_dim1; | |||
i__2 = imax + imax * a_dim1; | |||
d__1 = a[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
imax + 1 + (k + 1) * w_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] | |||
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
1) * w_dim1], &c__1, (ftnlen)12); | |||
i__1 = imax + (k + 1) * w_dim1; | |||
i__2 = imax + (k + 1) * w_dim1; | |||
d__1 = w[i__2].r; | |||
w[i__1].r = d__1, w[i__1].i = 0.; | |||
} | |||
if (imax != k) { | |||
i__1 = imax - k; | |||
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
c__1); | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& | |||
w[jmax + (k + 1) * w_dim1]), abs(d__2)); | |||
} else { | |||
rowmax = 0.; | |||
} | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = itemp + (k + 1) * w_dim1; | |||
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
itemp + (k + 1) * w_dim1]), abs(d__2)); | |||
if (dtemp > rowmax) { | |||
rowmax = dtemp; | |||
jmax = itemp; | |||
} | |||
} | |||
i__1 = imax + (k + 1) * w_dim1; | |||
if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
} | |||
if (! done) { | |||
goto L72; | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kstep == 2 && p != k) { | |||
i__1 = p + p * a_dim1; | |||
i__2 = k + k * a_dim1; | |||
d__1 = a[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
i__1 = p - k - 1; | |||
zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * | |||
a_dim1], lda); | |||
i__1 = p - k - 1; | |||
zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); | |||
if (p < *n) { | |||
i__1 = *n - p; | |||
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p | |||
* a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
} | |||
zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
} | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
d__1 = a[i__2].r; | |||
a[i__1].r = d__1, a[i__1].i = 0.; | |||
i__1 = kp - kk - 1; | |||
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
1) * a_dim1], lda); | |||
i__1 = kp - kk - 1; | |||
zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
if (kp < *n) { | |||
i__1 = *n - kp; | |||
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
+ kp * a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
} | |||
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
i__1 = k + k * a_dim1; | |||
t = a[i__1].r; | |||
if (abs(t) >= sfmin) { | |||
r1 = 1. / t; | |||
i__1 = *n - k; | |||
zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} else { | |||
i__1 = *n; | |||
for (ii = k + 1; ii <= i__1; ++ii) { | |||
i__2 = ii + k * a_dim1; | |||
i__3 = ii + k * a_dim1; | |||
z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L74: */ | |||
} | |||
} | |||
i__1 = *n - k; | |||
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
i__1 = k + 1 + k * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
d11.r = z__1.r, d11.i = z__1.i; | |||
d_cnjg(&z__2, &d21); | |||
z_div(&z__1, &w[k + k * w_dim1], &z__2); | |||
d22.r = z__1.r, d22.i = z__1.i; | |||
z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
t = 1. / (z__1.r - 1.); | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + k * w_dim1; | |||
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + (k + 1) * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
d_cnjg(&z__5, &d21); | |||
z_div(&z__2, &z__3, &z__5); | |||
z__1.r = t * z__2.r, z__1.i = t * z__2.i; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
i__2 = j + (k + 1) * a_dim1; | |||
i__3 = j + (k + 1) * w_dim1; | |||
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + k * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
z_div(&z__2, &z__3, &d21); | |||
z__1.r = t * z__2.r, z__1.i = t * z__2.i; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L80: */ | |||
} | |||
} | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + k * a_dim1; | |||
i__2 = k + 1 + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + (k + 1) * a_dim1; | |||
i__2 = k + 1 + (k + 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = *n - k; | |||
zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = *n - k - 1; | |||
zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
--j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
--j; | |||
if (jp2 != jj && j >= 1) { | |||
zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
--jj; | |||
if (kstep == 2 && jp1 != jj && j >= 1) { | |||
zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j > 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,87 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_zlauum_rec(const char *, const int *, double *, | |||
const int *, int *); | |||
/** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zlauum. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html | |||
* */ | |||
void RELAPACK_zlauum( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZLAUUM", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Recursive kernel | |||
RELAPACK_zlauum_rec(&cleanuplo, n, A, ldA, info); | |||
} | |||
/** zlauum's recursive compute kernel */ | |||
static void RELAPACK_zlauum_rec( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { | |||
// Unblocked | |||
LAPACK(zlauu2)(uplo, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
// Splitting | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + 2 * *ldA * n1; | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// recursion(A_TL) | |||
RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info); | |||
if (*uplo == 'L') { | |||
// A_TL = A_TL + A_BL' * A_BL | |||
BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
// A_BL = A_BR' * A_BL | |||
BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
} else { | |||
// A_TL = A_TL + A_TR * A_TR' | |||
BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
// A_TR = A_TR * A_BR' | |||
BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info); | |||
} |
@@ -0,0 +1,157 @@ | |||
#include "relapack.h" | |||
#include "stdlib.h" | |||
static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, | |||
double *, const int *, double *, const int *, int *); | |||
/** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zpbtrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html | |||
* */ | |||
void RELAPACK_zpbtrf( | |||
const char *uplo, const int *n, const int *kd, | |||
double *Ab, const int *ldAb, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*kd < 0) | |||
*info = -3; | |||
else if (*ldAb < *kd + 1) | |||
*info = -5; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZPBTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Constant | |||
const double ZERO[] = { 0., 0. }; | |||
// Allocate work space | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
double *Work = malloc(mWork * nWork * 2 * sizeof(double)); | |||
LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
// Recursive kernel | |||
RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
// Free work space | |||
free(Work); | |||
} | |||
/** zpbtrf's recursive compute kernel */ | |||
static void RELAPACK_zpbtrf_rec( | |||
const char *uplo, const int *n, const int *kd, | |||
double *Ab, const int *ldAb, | |||
double *Work, const int *ldWork, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { | |||
// Unblocked | |||
LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
// Unskew A | |||
const int ldA[] = { *ldAb - 1 }; | |||
double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); | |||
// Splitting | |||
const int n1 = MIN(ZREC_SPLIT(*n), *kd); | |||
const int n2 = *n - n1; | |||
// * * | |||
// * Ab_BR | |||
double *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + 2 * *ldA * n1; | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// recursion(A_TL) | |||
RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
// Banded splitting | |||
const int n21 = MIN(n2, *kd - n1); | |||
const int n22 = MIN(n2 - n21, *kd); | |||
// n1 n21 n22 | |||
// n1 * A_TRl A_TRr | |||
// n21 A_BLt A_BRtl A_BRtr | |||
// n22 A_BLb A_BRbl A_BRbr | |||
double *const A_TRl = A_TR; | |||
double *const A_TRr = A_TR + 2 * *ldA * n21; | |||
double *const A_BLt = A_BL; | |||
double *const A_BLb = A_BL + 2 * n21; | |||
double *const A_BRtl = A_BR; | |||
double *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
double *const A_BRbl = A_BR + 2 * n21; | |||
double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; | |||
if (*uplo == 'L') { | |||
// A_BLt = ABLt / A_TL' | |||
BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); | |||
// A_BRtl = A_BRtl - A_BLt * A_BLt' | |||
BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); | |||
// Work = A_BLb | |||
LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); | |||
// Work = Work / A_TL' | |||
BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); | |||
// A_BRbl = A_BRbl - Work * A_BLt' | |||
BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); | |||
// A_BRbr = A_BRbr - Work * Work' | |||
BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
// A_BLb = Work | |||
LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); | |||
} else { | |||
// A_TRl = A_TL' \ A_TRl | |||
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
// A_BRtl = A_BRtl - A_TRl' * A_TRl | |||
BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
// Work = A_TRr | |||
LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); | |||
// Work = A_TL' \ Work | |||
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); | |||
// A_BRtr = A_BRtr - A_TRl' * Work | |||
BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); | |||
// A_BRbr = A_BRbr - Work' * Work | |||
BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
// A_TRr = Work | |||
LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); | |||
} | |||
// recursion(A_BR) | |||
if (*kd > n1) | |||
RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info); | |||
else | |||
RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,92 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_zpotrf_rec(const char *, const int *, double *, | |||
const int *, int *); | |||
/** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zpotrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html | |||
* */ | |||
void RELAPACK_zpotrf( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZPOTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Recursive kernel | |||
RELAPACK_zpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
} | |||
/** zpotrf's recursive compute kernel */ | |||
static void RELAPACK_zpotrf_rec( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) { | |||
// Unblocked | |||
LAPACK(zpotf2)(uplo, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
// Splitting | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + 2 * *ldA * n1; | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// recursion(A_TL) | |||
RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
if (*uplo == 'L') { | |||
// A_BL = A_BL / A_TL' | |||
BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
// A_BR = A_BR - A_BL * A_BL' | |||
BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
} else { | |||
// A_TR = A_TL' \ A_TR | |||
BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
// A_BR = A_BR - A_TR' * A_TR | |||
BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,238 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, | |||
double *, const int *, int *, double *, const int *, int *); | |||
/** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zsytrf. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html | |||
* */ | |||
void RELAPACK_zsytrf( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
double *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * 2 * sizeof(double)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZSYTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy arguments | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** zsytrf's recursive compute kernel */ | |||
static void RELAPACK_zsytrf_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Loop iterator | |||
int i; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = ZREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
double *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
double *const A_BL_B = A + 2 * *n; | |||
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
double *const Work_BL = Work + 2 * n1; | |||
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
// last row of A_BL | |||
double *const A_BL_b = A_BL + 2 * n2_out; | |||
// last row of Work_BL | |||
double *const Work_BL_b = Work_BL + 2 * n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = ZREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
double *const A_TL_T = A + 2 * *ldA * n_rest; | |||
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
double *const Work_L = Work; | |||
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,452 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static doublecomplex c_b1 = {1.,0.}; | |||
static int c__1 = 1; | |||
/** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method. | |||
* | |||
* This routine is a minor modification of LAPACK's zlasyf. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int * | |||
nb, int *kb, doublecomplex *a, int *lda, int *ipiv, | |||
doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
double d__1, d__2, d__3, d__4; | |||
doublecomplex z__1, z__2, z__3; | |||
/* Builtin functions */ | |||
double sqrt(double), d_imag(doublecomplex *); | |||
void z_div(doublecomplex *, doublecomplex *, doublecomplex *); | |||
/* Local variables */ | |||
static int j, k; | |||
static doublecomplex t, r1, d11, d21, d22; | |||
static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
static double alpha; | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
extern /* Subroutine */ int zscal_(int *, doublecomplex *, | |||
doublecomplex *, int *); | |||
static int kstep; | |||
extern /* Subroutine */ int zgemv_(char *, int *, int *, | |||
doublecomplex *, doublecomplex *, int *, doublecomplex *, | |||
int *, doublecomplex *, doublecomplex *, int *, ftnlen), | |||
zcopy_(int *, doublecomplex *, int *, doublecomplex *, | |||
int *), zswap_(int *, doublecomplex *, int *, | |||
doublecomplex *, int *); | |||
static double absakk, colmax; | |||
extern int izamax_(int *, doublecomplex *, int *); | |||
static double rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.) + 1.) / 8.; | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
} | |||
kstep = 1; | |||
i__1 = k + kw * w_dim1; | |||
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw * | |||
w_dim1]), abs(d__2)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = imax + kw * w_dim1; | |||
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
kw * w_dim1]), abs(d__2)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = k - imax; | |||
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
i__1 = k - imax; | |||
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
&c__1); | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
jmax + (kw - 1) * w_dim1]), abs(d__2)); | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
/* Computing MAX */ | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( | |||
d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( | |||
d__2)); | |||
rowmax = max(d__3,d__4); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else /* if(complicated condition) */ { | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha * | |||
rowmax) { | |||
kp = imax; | |||
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
i__1 = kk - 1 - kp; | |||
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
if (kp > 1) { | |||
i__1 = kp - 1; | |||
zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
+ 1], &c__1); | |||
} | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
+ 1) * a_dim1], lda); | |||
} | |||
i__1 = *n - kk + 1; | |||
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
z_div(&z__1, &c_b1, &a[k + k * a_dim1]); | |||
r1.r = z__1.r, r1.i = z__1.i; | |||
i__1 = k - 1; | |||
zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
if (k > 2) { | |||
i__1 = k - 1 + kw * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
z_div(&z__1, &w[k + kw * w_dim1], &d21); | |||
d11.r = z__1.r, d11.i = z__1.i; | |||
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
d22.r = z__1.r, d22.i = z__1.i; | |||
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; | |||
z_div(&z__1, &c_b1, &z__2); | |||
t.r = z__1.r, t.i = z__1.i; | |||
z_div(&z__1, &t, &d21); | |||
d21.r = z__1.r, d21.i = z__1.i; | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
i__2 = j + (k - 1) * a_dim1; | |||
i__3 = j + (kw - 1) * w_dim1; | |||
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + kw * w_dim1; | |||
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
.i; | |||
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
d21.r * z__2.i + d21.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + kw * w_dim1; | |||
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + (kw - 1) * w_dim1; | |||
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
.i; | |||
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
d21.r * z__2.i + d21.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L20: */ | |||
} | |||
} | |||
i__1 = k - 1 + (k - 1) * a_dim1; | |||
i__2 = k - 1 + (kw - 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1 + k * a_dim1; | |||
i__2 = k - 1 + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
++j; | |||
} | |||
++j; | |||
if (jp != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
} | |||
if (j < *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k | |||
+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
kstep = 1; | |||
i__1 = k + k * w_dim1; | |||
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * | |||
w_dim1]), abs(d__2)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = imax + k * w_dim1; | |||
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
k * w_dim1]), abs(d__2)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
} else { | |||
if (absakk >= alpha * colmax) { | |||
kp = k; | |||
} else { | |||
i__1 = imax - k; | |||
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = *n - imax + 1; | |||
zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
1) * w_dim1], &c__1); | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], | |||
lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * | |||
w_dim1], &c__1, (ftnlen)12); | |||
i__1 = imax - k; | |||
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
; | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
jmax + (k + 1) * w_dim1]), abs(d__2)); | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
/* Computing MAX */ | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( | |||
d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( | |||
d__2)); | |||
rowmax = max(d__3,d__4); | |||
} | |||
if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
kp = k; | |||
} else /* if(complicated condition) */ { | |||
i__1 = imax + (k + 1) * w_dim1; | |||
if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha * | |||
rowmax) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + | |||
k * w_dim1], &c__1); | |||
} else { | |||
kp = imax; | |||
kstep = 2; | |||
} | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kp != kk) { | |||
i__1 = kp + kp * a_dim1; | |||
i__2 = kk + kk * a_dim1; | |||
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
i__1 = kp - kk - 1; | |||
zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
1) * a_dim1], lda); | |||
if (kp < *n) { | |||
i__1 = *n - kp; | |||
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
+ kp * a_dim1], &c__1); | |||
} | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
} | |||
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
z_div(&z__1, &c_b1, &a[k + k * a_dim1]); | |||
r1.r = z__1.r, r1.i = z__1.i; | |||
i__1 = *n - k; | |||
zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
i__1 = k + 1 + k * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
d11.r = z__1.r, d11.i = z__1.i; | |||
z_div(&z__1, &w[k + k * w_dim1], &d21); | |||
d22.r = z__1.r, d22.i = z__1.i; | |||
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; | |||
z_div(&z__1, &c_b1, &z__2); | |||
t.r = z__1.r, t.i = z__1.i; | |||
z_div(&z__1, &t, &d21); | |||
d21.r = z__1.r, d21.i = z__1.i; | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + k * w_dim1; | |||
z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + (k + 1) * w_dim1; | |||
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
.i; | |||
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
d21.r * z__2.i + d21.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
i__2 = j + (k + 1) * a_dim1; | |||
i__3 = j + (k + 1) * w_dim1; | |||
z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + k * w_dim1; | |||
z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
.i; | |||
z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
d21.r * z__2.i + d21.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L80: */ | |||
} | |||
} | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + k * a_dim1; | |||
i__2 = k + 1 + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + (k + 1) * a_dim1; | |||
i__2 = k + 1 + (k + 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -kp; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
jj = j; | |||
jp = ipiv[j]; | |||
if (jp < 0) { | |||
jp = -jp; | |||
--j; | |||
} | |||
--j; | |||
if (jp != jj && j >= 1) { | |||
zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j > 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,236 @@ | |||
#include "relapack.h" | |||
#if XSYTRF_ALLOW_MALLOC | |||
#include <stdlib.h> | |||
#endif | |||
static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *, | |||
double *, const int *, int *, double *, const int *, int *); | |||
/** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
* | |||
* This routine is functionally equivalent to LAPACK's zsytrf_rook. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html | |||
* */ | |||
void RELAPACK_zsytrf_rook( | |||
const char *uplo, const int *n, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *lWork, int *info | |||
) { | |||
// Required work size | |||
const int cleanlWork = *n * (*n / 2); | |||
int minlWork = cleanlWork; | |||
#if XSYTRF_ALLOW_MALLOC | |||
minlWork = 1; | |||
#endif | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (*n < 0) | |||
*info = -2; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -4; | |||
else if (*lWork < minlWork && *lWork != -1) | |||
*info = -7; | |||
else if (*lWork == -1) { | |||
// Work size query | |||
*Work = cleanlWork; | |||
return; | |||
} | |||
// Ensure Work size | |||
double *cleanWork = Work; | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (!*info && *lWork < cleanlWork) { | |||
cleanWork = malloc(cleanlWork * 2 * sizeof(double)); | |||
if (!cleanWork) | |||
*info = -7; | |||
} | |||
#endif | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZSYTRF", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
// Dummy argument | |||
int nout; | |||
// Recursive kernel | |||
RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
#if XSYTRF_ALLOW_MALLOC | |||
if (cleanWork != Work) | |||
free(cleanWork); | |||
#endif | |||
} | |||
/** zsytrf_rook's recursive compute kernel */ | |||
static void RELAPACK_zsytrf_rook_rec( | |||
const char *uplo, const int *n_full, const int *n, int *n_out, | |||
double *A, const int *ldA, int *ipiv, | |||
double *Work, const int *ldWork, int *info | |||
) { | |||
// top recursion level? | |||
const int top = *n_full == *n; | |||
if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { | |||
// Unblocked | |||
if (top) { | |||
LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); | |||
*n_out = *n; | |||
} else | |||
RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
return; | |||
} | |||
int info1, info2; | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
const int n_rest = *n_full - *n; | |||
if (*uplo == 'L') { | |||
// Splitting (setup) | |||
int n1 = ZREC_SPLIT(*n); | |||
int n2 = *n - n1; | |||
// Work_L * | |||
double *const Work_L = Work; | |||
// recursion(A_L) | |||
int n1_out; | |||
RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
n1 = n1_out; | |||
// Splitting (continued) | |||
n2 = *n - n1; | |||
const int n_full2 = *n_full - n1; | |||
// * * | |||
// A_BL A_BR | |||
// A_BL_B A_BR_B | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
double *const A_BL_B = A + 2 * *n; | |||
double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
// * * | |||
// Work_BL Work_BR | |||
// * * | |||
// (top recursion level: use Work as Work_BR) | |||
double *const Work_BL = Work + 2 * n1; | |||
double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
const int ldWork_BR = top ? n2 : *ldWork; | |||
// ipiv_T | |||
// ipiv_B | |||
int *const ipiv_B = ipiv + n1; | |||
// A_BR = A_BR - A_BL Work_BL' | |||
RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
// recursion(A_BR) | |||
int n2_out; | |||
RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
if (n2_out != n2) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// last column of A_BR | |||
double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
// last row of A_BL | |||
double *const A_BL_b = A_BL + 2 * n2_out; | |||
// last row of Work_BL | |||
double *const Work_BL_b = Work_BL + 2 * n2_out; | |||
// A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
} | |||
n2 = n2_out; | |||
// shift pivots | |||
int i; | |||
for (i = 0; i < n2; i++) | |||
if (ipiv_B[i] > 0) | |||
ipiv_B[i] += n1; | |||
else | |||
ipiv_B[i] -= n1; | |||
*info = info1 || info2; | |||
*n_out = n1 + n2; | |||
} else { | |||
// Splitting (setup) | |||
int n2 = ZREC_SPLIT(*n); | |||
int n1 = *n - n2; | |||
// * Work_R | |||
// (top recursion level: use Work as Work_R) | |||
double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
// recursion(A_R) | |||
int n2_out; | |||
RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
const int n2_diff = n2 - n2_out; | |||
n2 = n2_out; | |||
// Splitting (continued) | |||
n1 = *n - n2; | |||
const int n_full1 = *n_full - n2; | |||
// * A_TL_T A_TR_T | |||
// * A_TL A_TR | |||
// * * * | |||
double *const A_TL_T = A + 2 * *ldA * n_rest; | |||
double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
// Work_L * | |||
// * Work_TR | |||
// * * | |||
// (top recursion level: Work_R was Work) | |||
double *const Work_L = Work; | |||
double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
const int ldWork_L = top ? n1 : *ldWork; | |||
// A_TL = A_TL - A_TR Work_TR' | |||
RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
// recursion(A_TL) | |||
int n1_out; | |||
RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
if (n1_out != n1) { | |||
// undo 1 column of updates | |||
const int n_restp1 = n_rest + 1; | |||
// A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
} | |||
n1 = n1_out; | |||
*info = info2 || info1; | |||
*n_out = n1 + n2; | |||
} | |||
} |
@@ -0,0 +1,561 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "f2c.h" | |||
/* Table of constant values */ | |||
static doublecomplex c_b1 = {1.,0.}; | |||
static int c__1 = 1; | |||
/** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method. | |||
* | |||
* This routine is a minor modification of LAPACK's zlasyf_rook. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* The blocked BLAS Level 3 updates were removed and moved to the | |||
* recursive algorithm. | |||
* */ | |||
/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n, | |||
int *nb, int *kb, doublecomplex *a, int *lda, int * | |||
ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
double d__1, d__2; | |||
doublecomplex z__1, z__2, z__3, z__4; | |||
/* Builtin functions */ | |||
double sqrt(double), d_imag(doublecomplex *); | |||
void z_div(doublecomplex *, doublecomplex *, doublecomplex *); | |||
/* Local variables */ | |||
static int j, k, p; | |||
static doublecomplex t, r1, d11, d12, d21, d22; | |||
static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
static logical done; | |||
static int imax, jmax; | |||
static double alpha; | |||
extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
static double dtemp, sfmin; | |||
extern /* Subroutine */ int zscal_(int *, doublecomplex *, | |||
doublecomplex *, int *); | |||
static int itemp, kstep; | |||
extern /* Subroutine */ int zgemv_(char *, int *, int *, | |||
doublecomplex *, doublecomplex *, int *, doublecomplex *, | |||
int *, doublecomplex *, doublecomplex *, int *, ftnlen), | |||
zcopy_(int *, doublecomplex *, int *, doublecomplex *, | |||
int *), zswap_(int *, doublecomplex *, int *, | |||
doublecomplex *, int *); | |||
extern double dlamch_(char *, ftnlen); | |||
static double absakk, colmax; | |||
extern int izamax_(int *, doublecomplex *, int *); | |||
static double rowmax; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
--ipiv; | |||
w_dim1 = *ldw; | |||
w_offset = 1 + w_dim1; | |||
w -= w_offset; | |||
/* Function Body */ | |||
*info = 0; | |||
alpha = (sqrt(17.) + 1.) / 8.; | |||
sfmin = dlamch_("S", (ftnlen)1); | |||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
k = *n; | |||
L10: | |||
kw = *nb + k - *n; | |||
if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
goto L30; | |||
} | |||
kstep = 1; | |||
p = k; | |||
zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], | |||
lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
w_dim1 + 1], &c__1, (ftnlen)12); | |||
} | |||
i__1 = k + kw * w_dim1; | |||
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw * | |||
w_dim1]), abs(d__2)); | |||
if (k > 1) { | |||
i__1 = k - 1; | |||
imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
i__1 = imax + kw * w_dim1; | |||
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
kw * w_dim1]), abs(d__2)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L12: | |||
zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
w_dim1 + 1], &c__1); | |||
i__1 = k - imax; | |||
zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
1 + (kw - 1) * w_dim1], &c__1); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * | |||
a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
if (imax != k) { | |||
i__1 = k - imax; | |||
jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
w_dim1], &c__1); | |||
i__1 = jmax + (kw - 1) * w_dim1; | |||
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& | |||
w[jmax + (kw - 1) * w_dim1]), abs(d__2)); | |||
} else { | |||
rowmax = 0.; | |||
} | |||
if (imax > 1) { | |||
i__1 = imax - 1; | |||
itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
i__1 = itemp + (kw - 1) * w_dim1; | |||
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
itemp + (kw - 1) * w_dim1]), abs(d__2)); | |||
if (dtemp > rowmax) { | |||
rowmax = dtemp; | |||
jmax = itemp; | |||
} | |||
} | |||
i__1 = imax + (kw - 1) * w_dim1; | |||
if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax | |||
+ (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { | |||
kp = imax; | |||
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
w_dim1 + 1], &c__1); | |||
} | |||
if (! done) { | |||
goto L12; | |||
} | |||
} | |||
kk = k - kstep + 1; | |||
kkw = *nb + kk - *n; | |||
if (kstep == 2 && p != k) { | |||
i__1 = k - p; | |||
zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
a_dim1], lda); | |||
zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
c__1); | |||
i__1 = *n - k + 1; | |||
zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
lda); | |||
i__1 = *n - kk + 1; | |||
zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
ldw); | |||
} | |||
if (kp != kk) { | |||
i__1 = kp + k * a_dim1; | |||
i__2 = kk + k * a_dim1; | |||
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
i__1 = k - 1 - kp; | |||
zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
1) * a_dim1], lda); | |||
zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
c__1); | |||
i__1 = *n - kk + 1; | |||
zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
lda); | |||
i__1 = *n - kk + 1; | |||
zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
c__1); | |||
if (k > 1) { | |||
i__1 = k + k * a_dim1; | |||
if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + | |||
k * a_dim1]), abs(d__2)) >= sfmin) { | |||
z_div(&z__1, &c_b1, &a[k + k * a_dim1]); | |||
r1.r = z__1.r, r1.i = z__1.i; | |||
i__1 = k - 1; | |||
zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
} else /* if(complicated condition) */ { | |||
i__1 = k + k * a_dim1; | |||
if (a[i__1].r != 0. || a[i__1].i != 0.) { | |||
i__1 = k - 1; | |||
for (ii = 1; ii <= i__1; ++ii) { | |||
i__2 = ii + k * a_dim1; | |||
z_div(&z__1, &a[ii + k * a_dim1], &a[k + k * | |||
a_dim1]); | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L14: */ | |||
} | |||
} | |||
} | |||
} | |||
} else { | |||
if (k > 2) { | |||
i__1 = k - 1 + kw * w_dim1; | |||
d12.r = w[i__1].r, d12.i = w[i__1].i; | |||
z_div(&z__1, &w[k + kw * w_dim1], &d12); | |||
d11.r = z__1.r, d11.i = z__1.i; | |||
z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); | |||
d22.r = z__1.r, d22.i = z__1.i; | |||
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; | |||
z_div(&z__1, &c_b1, &z__2); | |||
t.r = z__1.r, t.i = z__1.i; | |||
i__1 = k - 2; | |||
for (j = 1; j <= i__1; ++j) { | |||
i__2 = j + (k - 1) * a_dim1; | |||
i__3 = j + (kw - 1) * w_dim1; | |||
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + kw * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
z_div(&z__2, &z__3, &d12); | |||
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * | |||
z__2.i + t.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + kw * w_dim1; | |||
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + (kw - 1) * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
z_div(&z__2, &z__3, &d12); | |||
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * | |||
z__2.i + t.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L20: */ | |||
} | |||
} | |||
i__1 = k - 1 + (k - 1) * a_dim1; | |||
i__2 = k - 1 + (kw - 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k - 1 + k * a_dim1; | |||
i__2 = k - 1 + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + kw * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k - 1] = -kp; | |||
} | |||
k -= kstep; | |||
goto L10; | |||
L30: | |||
j = k + 1; | |||
L60: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
++j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
++j; | |||
if (jp2 != jj && j <= *n) { | |||
i__1 = *n - j + 1; | |||
zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
jj = j - 1; | |||
if (jp1 != jj && kstep == 2) { | |||
i__1 = *n - j + 1; | |||
zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
; | |||
} | |||
if (j <= *n) { | |||
goto L60; | |||
} | |||
*kb = *n - k; | |||
} else { | |||
k = 1; | |||
L70: | |||
if ((k >= *nb && *nb < *n) || k > *n) { | |||
goto L90; | |||
} | |||
kstep = 1; | |||
p = k; | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & | |||
w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
ftnlen)12); | |||
} | |||
i__1 = k + k * w_dim1; | |||
absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * | |||
w_dim1]), abs(d__2)); | |||
if (k < *n) { | |||
i__1 = *n - k; | |||
imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
i__1 = imax + k * w_dim1; | |||
colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
k * w_dim1]), abs(d__2)); | |||
} else { | |||
colmax = 0.; | |||
} | |||
if (max(absakk,colmax) == 0.) { | |||
if (*info == 0) { | |||
*info = k; | |||
} | |||
kp = k; | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
} else { | |||
if (! (absakk < alpha * colmax)) { | |||
kp = k; | |||
} else { | |||
done = FALSE_; | |||
L72: | |||
i__1 = imax - k; | |||
zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = *n - imax + 1; | |||
zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
1) * w_dim1], &c__1); | |||
if (k > 1) { | |||
i__1 = *n - k + 1; | |||
i__2 = k - 1; | |||
z__1.r = -1., z__1.i = -0.; | |||
zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] | |||
, lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
1) * w_dim1], &c__1, (ftnlen)12); | |||
} | |||
if (imax != k) { | |||
i__1 = imax - k; | |||
jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
c__1); | |||
i__1 = jmax + (k + 1) * w_dim1; | |||
rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& | |||
w[jmax + (k + 1) * w_dim1]), abs(d__2)); | |||
} else { | |||
rowmax = 0.; | |||
} | |||
if (imax < *n) { | |||
i__1 = *n - imax; | |||
itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
w_dim1], &c__1); | |||
i__1 = itemp + (k + 1) * w_dim1; | |||
dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
itemp + (k + 1) * w_dim1]), abs(d__2)); | |||
if (dtemp > rowmax) { | |||
rowmax = dtemp; | |||
jmax = itemp; | |||
} | |||
} | |||
i__1 = imax + (k + 1) * w_dim1; | |||
if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax | |||
+ (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { | |||
kp = imax; | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
done = TRUE_; | |||
} else if (p == jmax || rowmax <= colmax) { | |||
kp = imax; | |||
kstep = 2; | |||
done = TRUE_; | |||
} else { | |||
p = imax; | |||
colmax = rowmax; | |||
imax = jmax; | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
w_dim1], &c__1); | |||
} | |||
if (! done) { | |||
goto L72; | |||
} | |||
} | |||
kk = k + kstep - 1; | |||
if (kstep == 2 && p != k) { | |||
i__1 = p - k; | |||
zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
lda); | |||
i__1 = *n - p + 1; | |||
zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
c__1); | |||
zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
} | |||
if (kp != kk) { | |||
i__1 = kp + k * a_dim1; | |||
i__2 = kk + k * a_dim1; | |||
a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
i__1 = kp - k - 1; | |||
zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
* a_dim1], lda); | |||
i__1 = *n - kp + 1; | |||
zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
a_dim1], &c__1); | |||
zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
} | |||
if (kstep == 1) { | |||
i__1 = *n - k + 1; | |||
zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
c__1); | |||
if (k < *n) { | |||
i__1 = k + k * a_dim1; | |||
if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + | |||
k * a_dim1]), abs(d__2)) >= sfmin) { | |||
z_div(&z__1, &c_b1, &a[k + k * a_dim1]); | |||
r1.r = z__1.r, r1.i = z__1.i; | |||
i__1 = *n - k; | |||
zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
} else /* if(complicated condition) */ { | |||
i__1 = k + k * a_dim1; | |||
if (a[i__1].r != 0. || a[i__1].i != 0.) { | |||
i__1 = *n; | |||
for (ii = k + 1; ii <= i__1; ++ii) { | |||
i__2 = ii + k * a_dim1; | |||
z_div(&z__1, &a[ii + k * a_dim1], &a[k + k * | |||
a_dim1]); | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L74: */ | |||
} | |||
} | |||
} | |||
} | |||
} else { | |||
if (k < *n - 1) { | |||
i__1 = k + 1 + k * w_dim1; | |||
d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
d11.r = z__1.r, d11.i = z__1.i; | |||
z_div(&z__1, &w[k + k * w_dim1], &d21); | |||
d22.r = z__1.r, d22.i = z__1.i; | |||
z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * | |||
d22.i + d11.i * d22.r; | |||
z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; | |||
z_div(&z__1, &c_b1, &z__2); | |||
t.r = z__1.r, t.i = z__1.i; | |||
i__1 = *n; | |||
for (j = k + 2; j <= i__1; ++j) { | |||
i__2 = j + k * a_dim1; | |||
i__3 = j + k * w_dim1; | |||
z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
.r; | |||
i__4 = j + (k + 1) * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
z_div(&z__2, &z__3, &d21); | |||
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * | |||
z__2.i + t.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
i__2 = j + (k + 1) * a_dim1; | |||
i__3 = j + (k + 1) * w_dim1; | |||
z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
.r; | |||
i__4 = j + k * w_dim1; | |||
z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
.i; | |||
z_div(&z__2, &z__3, &d21); | |||
z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * | |||
z__2.i + t.i * z__2.r; | |||
a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
/* L80: */ | |||
} | |||
} | |||
i__1 = k + k * a_dim1; | |||
i__2 = k + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + k * a_dim1; | |||
i__2 = k + 1 + k * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
i__1 = k + 1 + (k + 1) * a_dim1; | |||
i__2 = k + 1 + (k + 1) * w_dim1; | |||
a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
} | |||
} | |||
if (kstep == 1) { | |||
ipiv[k] = kp; | |||
} else { | |||
ipiv[k] = -p; | |||
ipiv[k + 1] = -kp; | |||
} | |||
k += kstep; | |||
goto L70; | |||
L90: | |||
j = k - 1; | |||
L120: | |||
kstep = 1; | |||
jp1 = 1; | |||
jj = j; | |||
jp2 = ipiv[j]; | |||
if (jp2 < 0) { | |||
jp2 = -jp2; | |||
--j; | |||
jp1 = -ipiv[j]; | |||
kstep = 2; | |||
} | |||
--j; | |||
if (jp2 != jj && j >= 1) { | |||
zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
jj = j + 1; | |||
if (jp1 != jj && kstep == 2) { | |||
zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
} | |||
if (j >= 1) { | |||
goto L120; | |||
} | |||
*kb = k - 1; | |||
} | |||
return; | |||
} |
@@ -0,0 +1,268 @@ | |||
#include "relapack.h" | |||
#include <math.h> | |||
static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, | |||
const int *, const double *, const int *, const double *, const int *, | |||
double *, const int *, const double *, const int *, const double *, | |||
const int *, double *, const int *, double *, double *, double *, int *); | |||
/** ZTGSYL solves the generalized Sylvester equation. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ztgsyl. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html | |||
* */ | |||
void RELAPACK_ztgsyl( | |||
const char *trans, const int *ijob, const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, | |||
const double *D, const int *ldD, const double *E, const int *ldE, | |||
double *F, const int *ldF, | |||
double *scale, double *dif, | |||
double *Work, const int *lWork, int *iWork, int *info | |||
) { | |||
// Parse arguments | |||
const int notran = LAPACK(lsame)(trans, "N"); | |||
const int tran = LAPACK(lsame)(trans, "C"); | |||
// Compute work buffer size | |||
int lwmin = 1; | |||
if (notran && (*ijob == 1 || *ijob == 2)) | |||
lwmin = MAX(1, 2 * *m * *n); | |||
*info = 0; | |||
// Check arguments | |||
if (!tran && !notran) | |||
*info = -1; | |||
else if (notran && (*ijob < 0 || *ijob > 4)) | |||
*info = -2; | |||
else if (*m <= 0) | |||
*info = -3; | |||
else if (*n <= 0) | |||
*info = -4; | |||
else if (*ldA < MAX(1, *m)) | |||
*info = -6; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -8; | |||
else if (*ldC < MAX(1, *m)) | |||
*info = -10; | |||
else if (*ldD < MAX(1, *m)) | |||
*info = -12; | |||
else if (*ldE < MAX(1, *n)) | |||
*info = -14; | |||
else if (*ldF < MAX(1, *m)) | |||
*info = -16; | |||
else if (*lWork < lwmin && *lWork != -1) | |||
*info = -20; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZTGSYL", &minfo); | |||
return; | |||
} | |||
if (*lWork == -1) { | |||
// Work size query | |||
*Work = lwmin; | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleantrans = notran ? 'N' : 'C'; | |||
// Constant | |||
const double ZERO[] = { 0., 0. }; | |||
int isolve = 1; | |||
int ifunc = 0; | |||
if (notran) { | |||
if (*ijob >= 3) { | |||
ifunc = *ijob - 2; | |||
LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
} else if (*ijob >= 1) | |||
isolve = 2; | |||
} | |||
double scale2; | |||
int iround; | |||
for (iround = 1; iround <= isolve; iround++) { | |||
*scale = 1; | |||
double dscale = 0; | |||
double dsum = 1; | |||
RELAPACK_ztgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info); | |||
if (dscale != 0) { | |||
if (*ijob == 1 || *ijob == 3) | |||
*dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); | |||
else | |||
*dif = sqrt(*m * *n) / (dscale * sqrt(dsum)); | |||
} | |||
if (isolve == 2) { | |||
if (iround == 1) { | |||
if (notran) | |||
ifunc = *ijob; | |||
scale2 = *scale; | |||
LAPACK(zlacpy)("F", m, n, C, ldC, Work, m); | |||
LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m); | |||
LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
} else { | |||
LAPACK(zlacpy)("F", m, n, Work, m, C, ldC); | |||
LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF); | |||
*scale = scale2; | |||
} | |||
} | |||
} | |||
} | |||
/** ztgsyl's recursive vompute kernel */ | |||
static void RELAPACK_ztgsyl_rec( | |||
const char *trans, const int *ifunc, const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, | |||
const double *D, const int *ldD, const double *E, const int *ldE, | |||
double *F, const int *ldF, | |||
double *scale, double *dsum, double *dscale, | |||
int *info | |||
) { | |||
if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) { | |||
// Unblocked | |||
LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const int iONE[] = { 1 }; | |||
// Outputs | |||
double scale1[] = { 1., 0. }; | |||
double scale2[] = { 1., 0. }; | |||
int info1[] = { 0 }; | |||
int info2[] = { 0 }; | |||
if (*m > *n) { | |||
// Splitting | |||
const int m1 = ZREC_SPLIT(*m); | |||
const int m2 = *m - m1; | |||
// A_TL A_TR | |||
// 0 A_BR | |||
const double *const A_TL = A; | |||
const double *const A_TR = A + 2 * *ldA * m1; | |||
const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
// C_T | |||
// C_B | |||
double *const C_T = C; | |||
double *const C_B = C + 2 * m1; | |||
// D_TL D_TR | |||
// 0 D_BR | |||
const double *const D_TL = D; | |||
const double *const D_TR = D + 2 * *ldD * m1; | |||
const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1; | |||
// F_T | |||
// F_B | |||
double *const F_T = F; | |||
double *const F_B = F + 2 * m1; | |||
if (*trans == 'N') { | |||
// recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); | |||
// C_T = C_T - A_TR * C_B | |||
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
// F_T = F_T - D_TR * C_B | |||
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
// recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
} | |||
} else { | |||
// recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); | |||
// apply scale | |||
if (scale1[0] != 1) | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
// C_B = C_B - A_TR^H * C_T | |||
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
// C_B = C_B - D_TR^H * F_T | |||
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
// recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
} | |||
} | |||
} else { | |||
// Splitting | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// B_TL B_TR | |||
// 0 B_BR | |||
const double *const B_TL = B; | |||
const double *const B_TR = B + 2 * *ldB * n1; | |||
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
// C_L C_R | |||
double *const C_L = C; | |||
double *const C_R = C + 2 * *ldC * n1; | |||
// E_TL E_TR | |||
// 0 E_BR | |||
const double *const E_TL = E; | |||
const double *const E_TR = E + 2 * *ldE * n1; | |||
const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1; | |||
// F_L F_R | |||
double *const F_L = F; | |||
double *const F_R = F + 2 * *ldF * n1; | |||
if (*trans == 'N') { | |||
// recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); | |||
// C_R = C_R + F_L * B_TR | |||
BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
// F_R = F_R + F_L * E_TR | |||
BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
// recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
} | |||
} else { | |||
// recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); | |||
// apply scale | |||
if (scale1[0] != 1) | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
// F_L = F_L + C_R * B_TR | |||
BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
// F_L = F_L + F_R * E_TR | |||
BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
// recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); | |||
// apply scale | |||
if (scale2[0] != 1) { | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
} | |||
} | |||
} | |||
*scale = scale1[0] * scale2[0]; | |||
*info = info1[0] || info2[0]; | |||
} |
@@ -0,0 +1,163 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, | |||
const int *, const int *, const double *, const int *, const double *, | |||
const int *, double *, const int *, double *, int *); | |||
/** ZTRSYL solves the complex Sylvester matrix equation. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ztrsyl. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html | |||
* */ | |||
void RELAPACK_ztrsyl( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, double *scale, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int notransA = LAPACK(lsame)(tranA, "N"); | |||
const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
const int notransB = LAPACK(lsame)(tranB, "N"); | |||
const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
*info = 0; | |||
if (!ctransA && !notransA) | |||
*info = -1; | |||
else if (!ctransB && !notransB) | |||
*info = -2; | |||
else if (*isgn != 1 && *isgn != -1) | |||
*info = -3; | |||
else if (*m < 0) | |||
*info = -4; | |||
else if (*n < 0) | |||
*info = -5; | |||
else if (*ldA < MAX(1, *m)) | |||
*info = -7; | |||
else if (*ldB < MAX(1, *n)) | |||
*info = -9; | |||
else if (*ldC < MAX(1, *m)) | |||
*info = -11; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZTRSYL", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleantranA = notransA ? 'N' : 'C'; | |||
const char cleantranB = notransB ? 'N' : 'C'; | |||
// Recursive kernel | |||
RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
} | |||
/** ztrsyl's recursive compute kernel */ | |||
static void RELAPACK_ztrsyl_rec( | |||
const char *tranA, const char *tranB, const int *isgn, | |||
const int *m, const int *n, | |||
const double *A, const int *ldA, const double *B, const int *ldB, | |||
double *C, const int *ldC, double *scale, | |||
int *info | |||
) { | |||
if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { | |||
// Unblocked | |||
RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1., 0. }; | |||
const double MONE[] = { -1., 0. }; | |||
const double MSGN[] = { -*isgn, 0. }; | |||
const int iONE[] = { 1 }; | |||
// Outputs | |||
double scale1[] = { 1., 0. }; | |||
double scale2[] = { 1., 0. }; | |||
int info1[] = { 0 }; | |||
int info2[] = { 0 }; | |||
if (*m > *n) { | |||
// Splitting | |||
const int m1 = ZREC_SPLIT(*m); | |||
const int m2 = *m - m1; | |||
// A_TL A_TR | |||
// 0 A_BR | |||
const double *const A_TL = A; | |||
const double *const A_TR = A + 2 * *ldA * m1; | |||
const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
// C_T | |||
// C_B | |||
double *const C_T = C; | |||
double *const C_B = C + 2 * m1; | |||
if (*tranA == 'N') { | |||
// recusion(A_BR, B, C_B) | |||
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
// C_T = C_T - A_TR * C_B | |||
BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
// recusion(A_TL, B, C_T) | |||
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
} else { | |||
// recusion(A_TL, B, C_T) | |||
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
// C_B = C_B - A_TR' * C_T | |||
BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
// recusion(A_BR, B, C_B) | |||
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
} | |||
} else { | |||
// Splitting | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// B_TL B_TR | |||
// 0 B_BR | |||
const double *const B_TL = B; | |||
const double *const B_TR = B + 2 * *ldB * n1; | |||
const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
// C_L C_R | |||
double *const C_L = C; | |||
double *const C_R = C + 2 * *ldC * n1; | |||
if (*tranB == 'N') { | |||
// recusion(A, B_TL, C_L) | |||
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
// C_R = C_R -/+ C_L * B_TR | |||
BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
// recusion(A, B_BR, C_R) | |||
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
} else { | |||
// recusion(A, B_BR, C_R) | |||
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
// C_L = C_L -/+ C_R * B_TR' | |||
BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
// recusion(A, B_TL, C_L) | |||
RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
// apply scale | |||
if (scale2[0] != 1) | |||
LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
} | |||
} | |||
*scale = scale1[0] * scale2[0]; | |||
*info = info1[0] || info2[0]; | |||
} |
@@ -0,0 +1,394 @@ | |||
/* -- translated by f2c (version 20100827). | |||
You must link the resulting object file with libf2c: | |||
on Microsoft Windows system, link with libf2c.lib; | |||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
-- in that order, at the end of the command line, as in | |||
cc *.o -lf2c -lm | |||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
http://www.netlib.org/f2c/libf2c.zip | |||
*/ | |||
#include "../config.h" | |||
#include "f2c.h" | |||
#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { | |||
extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); | |||
doublecomplex result; | |||
zdotu_(&result, n, x, incx, y, incy); | |||
return result; | |||
} | |||
#define zdotu_ zdotu_fun | |||
doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { | |||
extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); | |||
doublecomplex result; | |||
zdotc_(&result, n, x, incx, y, incy); | |||
return result; | |||
} | |||
#define zdotc_ zdotc_fun | |||
#endif | |||
#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { | |||
extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *); | |||
doublecomplex result; | |||
zladiv_(&result, a, b); | |||
return result; | |||
} | |||
#define zladiv_ zladiv_fun | |||
#endif | |||
/* Table of constant values */ | |||
static int c__1 = 1; | |||
/** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) | |||
* | |||
* This routine is an exact copy of LAPACK's ztrsyl. | |||
* It serves as an unblocked kernel in the recursive algorithms. | |||
* */ | |||
/* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int | |||
*isgn, int *m, int *n, doublecomplex *a, int *lda, | |||
doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, | |||
double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) | |||
{ | |||
/* System generated locals */ | |||
int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, | |||
i__3, i__4; | |||
double d__1, d__2; | |||
doublecomplex z__1, z__2, z__3, z__4; | |||
/* Builtin functions */ | |||
double d_imag(doublecomplex *); | |||
void d_cnjg(doublecomplex *, doublecomplex *); | |||
/* Local variables */ | |||
static int j, k, l; | |||
static doublecomplex a11; | |||
static double db; | |||
static doublecomplex x11; | |||
static double da11; | |||
static doublecomplex vec; | |||
static double dum[1], eps, sgn, smin; | |||
static doublecomplex suml, sumr; | |||
extern int lsame_(char *, char *, ftnlen, ftnlen); | |||
/* Double Complex */ doublecomplex zdotc_(int *, | |||
doublecomplex *, int *, doublecomplex *, int *), zdotu_( | |||
int *, doublecomplex *, int *, | |||
doublecomplex *, int *); | |||
extern /* Subroutine */ int dlabad_(double *, double *); | |||
extern double dlamch_(char *, ftnlen); | |||
static double scaloc; | |||
extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); | |||
extern double zlange_(char *, int *, int *, doublecomplex *, | |||
int *, double *, ftnlen); | |||
static double bignum; | |||
extern /* Subroutine */ int zdscal_(int *, double *, | |||
doublecomplex *, int *); | |||
/* Double Complex */ doublecomplex zladiv_(doublecomplex *, | |||
doublecomplex *); | |||
static int notrna, notrnb; | |||
static double smlnum; | |||
/* Parameter adjustments */ | |||
a_dim1 = *lda; | |||
a_offset = 1 + a_dim1; | |||
a -= a_offset; | |||
b_dim1 = *ldb; | |||
b_offset = 1 + b_dim1; | |||
b -= b_offset; | |||
c_dim1 = *ldc; | |||
c_offset = 1 + c_dim1; | |||
c__ -= c_offset; | |||
/* Function Body */ | |||
notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); | |||
notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); | |||
*info = 0; | |||
if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { | |||
*info = -1; | |||
} else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { | |||
*info = -2; | |||
} else if (*isgn != 1 && *isgn != -1) { | |||
*info = -3; | |||
} else if (*m < 0) { | |||
*info = -4; | |||
} else if (*n < 0) { | |||
*info = -5; | |||
} else if (*lda < max(1,*m)) { | |||
*info = -7; | |||
} else if (*ldb < max(1,*n)) { | |||
*info = -9; | |||
} else if (*ldc < max(1,*m)) { | |||
*info = -11; | |||
} | |||
if (*info != 0) { | |||
i__1 = -(*info); | |||
xerbla_("ZTRSY2", &i__1, (ftnlen)6); | |||
return; | |||
} | |||
*scale = 1.; | |||
if (*m == 0 || *n == 0) { | |||
return; | |||
} | |||
eps = dlamch_("P", (ftnlen)1); | |||
smlnum = dlamch_("S", (ftnlen)1); | |||
bignum = 1. / smlnum; | |||
dlabad_(&smlnum, &bignum); | |||
smlnum = smlnum * (double) (*m * *n) / eps; | |||
bignum = 1. / smlnum; | |||
/* Computing MAX */ | |||
d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, ( | |||
ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n, | |||
&b[b_offset], ldb, dum, (ftnlen)1); | |||
smin = max(d__1,d__2); | |||
sgn = (double) (*isgn); | |||
if (notrna && notrnb) { | |||
i__1 = *n; | |||
for (l = 1; l <= i__1; ++l) { | |||
for (k = *m; k >= 1; --k) { | |||
i__2 = *m - k; | |||
/* Computing MIN */ | |||
i__3 = k + 1; | |||
/* Computing MIN */ | |||
i__4 = k + 1; | |||
z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ | |||
min(i__4,*m) + l * c_dim1], &c__1); | |||
suml.r = z__1.r, suml.i = z__1.i; | |||
i__2 = l - 1; | |||
z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] | |||
, &c__1); | |||
sumr.r = z__1.r, sumr.i = z__1.i; | |||
i__2 = k + l * c_dim1; | |||
z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; | |||
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; | |||
z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; | |||
vec.r = z__1.r, vec.i = z__1.i; | |||
scaloc = 1.; | |||
i__2 = k + k * a_dim1; | |||
i__3 = l + l * b_dim1; | |||
z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i; | |||
z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i; | |||
a11.r = z__1.r, a11.i = z__1.i; | |||
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( | |||
d__2)); | |||
if (da11 <= smin) { | |||
a11.r = smin, a11.i = 0.; | |||
da11 = smin; | |||
*info = 1; | |||
} | |||
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( | |||
d__2)); | |||
if (da11 < 1. && db > 1.) { | |||
if (db > bignum * da11) { | |||
scaloc = 1. / db; | |||
} | |||
} | |||
z__3.r = scaloc, z__3.i = 0.; | |||
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * | |||
z__3.i + vec.i * z__3.r; | |||
z__1 = zladiv_(&z__2, &a11); | |||
x11.r = z__1.r, x11.i = z__1.i; | |||
if (scaloc != 1.) { | |||
i__2 = *n; | |||
for (j = 1; j <= i__2; ++j) { | |||
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
/* L10: */ | |||
} | |||
*scale *= scaloc; | |||
} | |||
i__2 = k + l * c_dim1; | |||
c__[i__2].r = x11.r, c__[i__2].i = x11.i; | |||
/* L20: */ | |||
} | |||
/* L30: */ | |||
} | |||
} else if (! notrna && notrnb) { | |||
i__1 = *n; | |||
for (l = 1; l <= i__1; ++l) { | |||
i__2 = *m; | |||
for (k = 1; k <= i__2; ++k) { | |||
i__3 = k - 1; | |||
z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * | |||
c_dim1 + 1], &c__1); | |||
suml.r = z__1.r, suml.i = z__1.i; | |||
i__3 = l - 1; | |||
z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] | |||
, &c__1); | |||
sumr.r = z__1.r, sumr.i = z__1.i; | |||
i__3 = k + l * c_dim1; | |||
z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; | |||
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; | |||
z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i; | |||
vec.r = z__1.r, vec.i = z__1.i; | |||
scaloc = 1.; | |||
d_cnjg(&z__2, &a[k + k * a_dim1]); | |||
i__3 = l + l * b_dim1; | |||
z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; | |||
z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; | |||
a11.r = z__1.r, a11.i = z__1.i; | |||
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( | |||
d__2)); | |||
if (da11 <= smin) { | |||
a11.r = smin, a11.i = 0.; | |||
da11 = smin; | |||
*info = 1; | |||
} | |||
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( | |||
d__2)); | |||
if (da11 < 1. && db > 1.) { | |||
if (db > bignum * da11) { | |||
scaloc = 1. / db; | |||
} | |||
} | |||
z__3.r = scaloc, z__3.i = 0.; | |||
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * | |||
z__3.i + vec.i * z__3.r; | |||
z__1 = zladiv_(&z__2, &a11); | |||
x11.r = z__1.r, x11.i = z__1.i; | |||
if (scaloc != 1.) { | |||
i__3 = *n; | |||
for (j = 1; j <= i__3; ++j) { | |||
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
/* L40: */ | |||
} | |||
*scale *= scaloc; | |||
} | |||
i__3 = k + l * c_dim1; | |||
c__[i__3].r = x11.r, c__[i__3].i = x11.i; | |||
/* L50: */ | |||
} | |||
/* L60: */ | |||
} | |||
} else if (! notrna && ! notrnb) { | |||
for (l = *n; l >= 1; --l) { | |||
i__1 = *m; | |||
for (k = 1; k <= i__1; ++k) { | |||
i__2 = k - 1; | |||
z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * | |||
c_dim1 + 1], &c__1); | |||
suml.r = z__1.r, suml.i = z__1.i; | |||
i__2 = *n - l; | |||
/* Computing MIN */ | |||
i__3 = l + 1; | |||
/* Computing MIN */ | |||
i__4 = l + 1; | |||
z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ | |||
l + min(i__4,*n) * b_dim1], ldb); | |||
sumr.r = z__1.r, sumr.i = z__1.i; | |||
i__2 = k + l * c_dim1; | |||
d_cnjg(&z__4, &sumr); | |||
z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i; | |||
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; | |||
z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; | |||
vec.r = z__1.r, vec.i = z__1.i; | |||
scaloc = 1.; | |||
i__2 = k + k * a_dim1; | |||
i__3 = l + l * b_dim1; | |||
z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; | |||
z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i; | |||
d_cnjg(&z__1, &z__2); | |||
a11.r = z__1.r, a11.i = z__1.i; | |||
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( | |||
d__2)); | |||
if (da11 <= smin) { | |||
a11.r = smin, a11.i = 0.; | |||
da11 = smin; | |||
*info = 1; | |||
} | |||
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( | |||
d__2)); | |||
if (da11 < 1. && db > 1.) { | |||
if (db > bignum * da11) { | |||
scaloc = 1. / db; | |||
} | |||
} | |||
z__3.r = scaloc, z__3.i = 0.; | |||
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * | |||
z__3.i + vec.i * z__3.r; | |||
z__1 = zladiv_(&z__2, &a11); | |||
x11.r = z__1.r, x11.i = z__1.i; | |||
if (scaloc != 1.) { | |||
i__2 = *n; | |||
for (j = 1; j <= i__2; ++j) { | |||
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
/* L70: */ | |||
} | |||
*scale *= scaloc; | |||
} | |||
i__2 = k + l * c_dim1; | |||
c__[i__2].r = x11.r, c__[i__2].i = x11.i; | |||
/* L80: */ | |||
} | |||
/* L90: */ | |||
} | |||
} else if (notrna && ! notrnb) { | |||
for (l = *n; l >= 1; --l) { | |||
for (k = *m; k >= 1; --k) { | |||
i__1 = *m - k; | |||
/* Computing MIN */ | |||
i__2 = k + 1; | |||
/* Computing MIN */ | |||
i__3 = k + 1; | |||
z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[ | |||
min(i__3,*m) + l * c_dim1], &c__1); | |||
suml.r = z__1.r, suml.i = z__1.i; | |||
i__1 = *n - l; | |||
/* Computing MIN */ | |||
i__2 = l + 1; | |||
/* Computing MIN */ | |||
i__3 = l + 1; | |||
z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[ | |||
l + min(i__3,*n) * b_dim1], ldb); | |||
sumr.r = z__1.r, sumr.i = z__1.i; | |||
i__1 = k + l * c_dim1; | |||
d_cnjg(&z__4, &sumr); | |||
z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i; | |||
z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; | |||
z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i; | |||
vec.r = z__1.r, vec.i = z__1.i; | |||
scaloc = 1.; | |||
i__1 = k + k * a_dim1; | |||
d_cnjg(&z__3, &b[l + l * b_dim1]); | |||
z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i; | |||
z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i; | |||
a11.r = z__1.r, a11.i = z__1.i; | |||
da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( | |||
d__2)); | |||
if (da11 <= smin) { | |||
a11.r = smin, a11.i = 0.; | |||
da11 = smin; | |||
*info = 1; | |||
} | |||
db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( | |||
d__2)); | |||
if (da11 < 1. && db > 1.) { | |||
if (db > bignum * da11) { | |||
scaloc = 1. / db; | |||
} | |||
} | |||
z__3.r = scaloc, z__3.i = 0.; | |||
z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * | |||
z__3.i + vec.i * z__3.r; | |||
z__1 = zladiv_(&z__2, &a11); | |||
x11.r = z__1.r, x11.i = z__1.i; | |||
if (scaloc != 1.) { | |||
i__1 = *n; | |||
for (j = 1; j <= i__1; ++j) { | |||
zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
/* L100: */ | |||
} | |||
*scale *= scaloc; | |||
} | |||
i__1 = k + l * c_dim1; | |||
c__[i__1].r = x11.r, c__[i__1].i = x11.i; | |||
/* L110: */ | |||
} | |||
/* L120: */ | |||
} | |||
} | |||
return; | |||
} |
@@ -0,0 +1,107 @@ | |||
#include "relapack.h" | |||
static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, | |||
double *, const int *, int *); | |||
/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. | |||
* | |||
* This routine is functionally equivalent to LAPACK's ztrtri. | |||
* For details on its interface, see | |||
* http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html | |||
* */ | |||
void RELAPACK_ztrtri( | |||
const char *uplo, const char *diag, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
) { | |||
// Check arguments | |||
const int lower = LAPACK(lsame)(uplo, "L"); | |||
const int upper = LAPACK(lsame)(uplo, "U"); | |||
const int nounit = LAPACK(lsame)(diag, "N"); | |||
const int unit = LAPACK(lsame)(diag, "U"); | |||
*info = 0; | |||
if (!lower && !upper) | |||
*info = -1; | |||
else if (!nounit && !unit) | |||
*info = -2; | |||
else if (*n < 0) | |||
*info = -3; | |||
else if (*ldA < MAX(1, *n)) | |||
*info = -5; | |||
if (*info) { | |||
const int minfo = -*info; | |||
LAPACK(xerbla)("ZTRTRI", &minfo); | |||
return; | |||
} | |||
// Clean char * arguments | |||
const char cleanuplo = lower ? 'L' : 'U'; | |||
const char cleandiag = nounit ? 'N' : 'U'; | |||
// check for singularity | |||
if (nounit) { | |||
int i; | |||
for (i = 0; i < *n; i++) | |||
if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { | |||
*info = i; | |||
return; | |||
} | |||
} | |||
// Recursive kernel | |||
RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
} | |||
/** ztrtri's recursive compute kernel */ | |||
static void RELAPACK_ztrtri_rec( | |||
const char *uplo, const char *diag, const int *n, | |||
double *A, const int *ldA, | |||
int *info | |||
){ | |||
if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) { | |||
// Unblocked | |||
LAPACK(ztrti2)(uplo, diag, n, A, ldA, info); | |||
return; | |||
} | |||
// Constants | |||
const double ONE[] = { 1. }; | |||
const double MONE[] = { -1. }; | |||
// Splitting | |||
const int n1 = ZREC_SPLIT(*n); | |||
const int n2 = *n - n1; | |||
// A_TL A_TR | |||
// A_BL A_BR | |||
double *const A_TL = A; | |||
double *const A_TR = A + 2 * *ldA * n1; | |||
double *const A_BL = A + 2 * n1; | |||
double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
// recursion(A_TL) | |||
RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
if (*info) | |||
return; | |||
if (*uplo == 'L') { | |||
// A_BL = - A_BL * A_TL | |||
BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
// A_BL = A_BR \ A_BL | |||
BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
} else { | |||
// A_TR = - A_TL * A_TR | |||
BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
// A_TR = A_TR / A_BR | |||
BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
} | |||
// recursion(A_BR) | |||
RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
if (*info) | |||
*info += n1; | |||
} |
@@ -0,0 +1,48 @@ | |||
ReLAPACK Test Suite | |||
=================== | |||
This test suite compares ReLAPACK's recursive routines with LAPACK's compute | |||
routines in terms of accuracy: For each test-case, we execute both ReLAPACK's | |||
and LAPACK's routine on the same data and consider the numerical difference | |||
between the two solutions. | |||
This difference is computed as the maximum error across all elements of the | |||
routine's outputs, where the error for each element is the minimum of the | |||
absolute error and the relative error (with LAPACK as the reference). If the | |||
error is below the error bound configured in `config.h` (default: 1e-5 for | |||
single precision and 1e-14 for double precision) the test-case is considered as | |||
passed. | |||
For each routine the test-cases cover a variety of input argument combinations | |||
to ensure that ReLAPACK's routines match the functionality of LAPACK for all use | |||
cases. | |||
The matrix size for all experiments (default: 100) can also be specified in | |||
`config.h`. | |||
Implementation | |||
-------------- | |||
`test.h` provides the framework for our tests: It provides macros that allow to | |||
generalize the tests for each operation in one file covering all data-types. | |||
Such a file is structured as follows: | |||
* All matrices required by the test-cases are declared globally. For each | |||
matrix, an array of two pointers is declared; one for the matrix copy passed | |||
to ReLAPACK and one passed to LAPACK. | |||
* `tests()` contains the main control flow: it allocates (and later frees) the | |||
copies of the globally declared matrices. It then defines the macro | |||
`ROUTINE` to contain the name of the currently tested routine. | |||
It then uses the macro `TEST` to perform the test-cases. | |||
It receives the arguments of the routine, where matrices of which ReLAPACK | |||
and LAPACK receive a copy are index with `i`. (Example: `TEST("L", &n, A[i], | |||
&n, info);`) | |||
* The macro `TEST` first calls `pre()`, which initializes all relevant | |||
matrices, then executes the ReLAPACK algorithm on the matrices with `i` = `0` | |||
and then the LAPACK counter part with `i` = `1`. It then calls `post()`, | |||
which computes the difference between the results, storing it in `error`. | |||
Finally, the error is printed out and compared to the error bound. | |||
If all test-cases pass the error bound test, the program will have a `0` return | |||
value, otherwise it is `1`, indicating an error. |
@@ -0,0 +1,13 @@ | |||
#ifndef TEST_CONFIG_H | |||
#define TEST_CONFIG_H | |||
// error bound for single and single complex routines | |||
#define SINGLE_ERR_BOUND 1e-4 | |||
// error bound for double an double complex routines | |||
#define DOUBLE_ERR_BOUND 1e-13 | |||
// size of test matrices | |||
#define TEST_SIZE 100 | |||
#endif /* TEST_CONFIG_H */ |
@@ -0,0 +1,64 @@ | |||
#ifndef LAPACK_H2 | |||
#define LAPACK_H2 | |||
#include "../config.h" | |||
void LAPACK(slauum)(const char *, const int *, float *, const int *, int *); | |||
void LAPACK(dlauum)(const char *, const int *, double *, const int *, int *); | |||
void LAPACK(clauum)(const char *, const int *, float *, const int *, int *); | |||
void LAPACK(zlauum)(const char *, const int *, double *, const int *, int *); | |||
void LAPACK(strtri)(const char *, const char *, const int *, float *, const int *, int *); | |||
void LAPACK(dtrtri)(const char *, const char *, const int *, double *, const int *, int *); | |||
void LAPACK(ctrtri)(const char *, const char *, const int *, float *, const int *, int *); | |||
void LAPACK(ztrtri)(const char *, const char *, const int *, double *, const int *, int *); | |||
void LAPACK(spotrf)(const char *, const int *, float *, const int *, int *); | |||
void LAPACK(dpotrf)(const char *, const int *, double *, const int *, int *); | |||
void LAPACK(cpotrf)(const char *, const int *, float *, const int *, int *); | |||
void LAPACK(zpotrf)(const char *, const int *, double *, const int *, int *); | |||
void LAPACK(spbtrf)(const char *, const int *, const int *, float *, const int *, int *); | |||
void LAPACK(dpbtrf)(const char *, const int *, const int *, double *, const int *, int *); | |||
void LAPACK(cpbtrf)(const char *, const int *, const int *, float *, const int *, int *); | |||
void LAPACK(zpbtrf)(const char *, const int *, const int *, double *, const int *, int *); | |||
void LAPACK(ssytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void LAPACK(dsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void LAPACK(csytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void LAPACK(chetrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void LAPACK(zsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void LAPACK(zhetrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void LAPACK(ssytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void LAPACK(dsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void LAPACK(csytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void LAPACK(chetrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
void LAPACK(zsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void LAPACK(zhetrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
void LAPACK(sgetrf)(const int *, const int *, float *, const int *, int *, int *); | |||
void LAPACK(dgetrf)(const int *, const int *, double *, const int *, int *, int *); | |||
void LAPACK(cgetrf)(const int *, const int *, float *, const int *, int *, int *); | |||
void LAPACK(zgetrf)(const int *, const int *, double *, const int *, int *, int *); | |||
void LAPACK(sgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
void LAPACK(dgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
void LAPACK(cgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
void LAPACK(zgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
void LAPACK(ssygst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
void LAPACK(dsygst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
void LAPACK(chegst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
void LAPACK(zhegst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
void LAPACK(strsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
void LAPACK(dtrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
void LAPACK(ctrsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
void LAPACK(ztrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
void LAPACK(stgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); | |||
void LAPACK(dtgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); | |||
void LAPACK(ctgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); | |||
void LAPACK(ztgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); | |||
#endif /* LAPACK_H2 */ |
@@ -0,0 +1,136 @@ | |||
#ifndef TEST_H | |||
#define TEST_H | |||
#include "../config.h" | |||
#include "config.h" | |||
#if BLAS_UNDERSCORE | |||
#define BLAS(routine) routine ## _ | |||
#else | |||
#define BLAS(routine) routine | |||
#endif | |||
#if LAPACK_UNDERSCORE | |||
#define LAPACK(routine) routine ## _ | |||
#else | |||
#define LAPACK(routine) routine | |||
#endif | |||
#include "../inc/relapack.h" | |||
#include "lapack.h" | |||
#include "util.h" | |||
#include <stdlib.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
// some name mangling macros | |||
#define CAT(A, B) A ## B | |||
#define XCAT(A, B) CAT(A, B) | |||
#define XLAPACK(X) LAPACK(X) | |||
#define XRELAPACK(X) XCAT(RELAPACK_, X) | |||
#define STR(X) #X | |||
#define XSTR(X) STR(X) | |||
// default setup and error computation names: pre() and post() | |||
#define PRE pre | |||
#define POST post | |||
// TEST macro: | |||
// run setup (pre()), ReLAPACK routine (i = 0), LAPACK routine (i = 1), compute | |||
// error (post()), check error bound, and print setup and error | |||
#define TEST(...) \ | |||
PRE(); \ | |||
i = 0; \ | |||
XRELAPACK(ROUTINE)(__VA_ARGS__); \ | |||
i = 1; \ | |||
XLAPACK(ROUTINE)(__VA_ARGS__); \ | |||
POST(); \ | |||
fail |= error > ERR_BOUND; \ | |||
printf("%s(%s)\t%g\n", XSTR(ROUTINE), #__VA_ARGS__, error); | |||
// generalized datatype treatment: DT_PREFIX determines the type s, d, c, or z | |||
#define XPREF(A) XCAT(DT_PREFIX, A) | |||
// matrix generation and error computation routines | |||
#define x2matgen XPREF(2matgen) | |||
#define x2vecerr XPREF(2vecerr) | |||
// error bounds | |||
#define ERR_BOUND XPREF(ERR_BOUND_) | |||
#define sERR_BOUND_ SINGLE_ERR_BOUND | |||
#define dERR_BOUND_ DOUBLE_ERR_BOUND | |||
#define cERR_BOUND_ SINGLE_ERR_BOUND | |||
#define zERR_BOUND_ DOUBLE_ERR_BOUND | |||
// C datatypes | |||
#define datatype XPREF(datatype_) | |||
#define sdatatype_ float | |||
#define ddatatype_ double | |||
#define cdatatype_ float | |||
#define zdatatype_ double | |||
// number of C datatype elements per element | |||
#define x1 XPREF(DT_MULT) | |||
#define sDT_MULT 1 | |||
#define dDT_MULT 1 | |||
#define cDT_MULT 2 | |||
#define zDT_MULT 2 | |||
// typed allocations | |||
#define xmalloc XPREF(malloc) | |||
#define imalloc(S) malloc((S) * sizeof(int)) | |||
#define smalloc(S) malloc((S) * sizeof(float)) | |||
#define dmalloc(S) malloc((S) * sizeof(double)) | |||
#define cmalloc(S) malloc((S) * 2 * sizeof(float)) | |||
#define zmalloc(S) malloc((S) * 2 * sizeof(double)) | |||
// transpositions | |||
#define xCTRANS XPREF(CTRANS) | |||
#define sCTRANS "T" | |||
#define dCTRANS "T" | |||
#define cCTRANS "C" | |||
#define zCTRANS "C" | |||
// some constants | |||
#define MONE XPREF(MONE) | |||
const float sMONE[] = { -1. }; | |||
const double dMONE[] = { -1. }; | |||
const float cMONE[] = { -1., 0. }; | |||
const double zMONE[] = { -1., 0. }; | |||
#define ZERO XPREF(ZERO) | |||
const float sZERO[] = { 0. }; | |||
const double dZERO[] = { 0. }; | |||
const float cZERO[] = { 0., 0. }; | |||
const double zZERO[] = { 0., 0. }; | |||
#define ONE XPREF(ONE) | |||
const float sONE[] = { 1. }; | |||
const double dONE[] = { 1. }; | |||
const float cONE[] = { 1., 0. }; | |||
const double zONE[] = { 1., 0. }; | |||
const int iMONE[] = { -1 }; | |||
const int iZERO[] = { 0 }; | |||
const int iONE[] = { 1 }; | |||
const int iTWO[] = { 2 }; | |||
const int iTHREE[] = { 3 }; | |||
const int iFOUR[] = { 4 }; | |||
void tests(); | |||
// global variables (used in tests(), pre(), and post()) | |||
int i, n, n2, fail; | |||
double error; | |||
int main(int argc, char* argv[]) { | |||
n = TEST_SIZE; | |||
n2 = (3 * n) / 4; | |||
fail = 0; | |||
tests(); | |||
return fail; | |||
} | |||
#endif /* TEST_H */ |
@@ -0,0 +1,116 @@ | |||
#include "util.h" | |||
#include <stdlib.h> | |||
#include <time.h> | |||
#include <math.h> | |||
#define MAX(a, b) ((a) > (b) ? (a) : (b)) | |||
#define MIN(a, b) ((a) < (b) ? (a) : (b)) | |||
/////////////////////// | |||
// matrix generation // | |||
/////////////////////// | |||
// Each routine x2matgen is passed the size (m, n) of the desired matrix and | |||
// geneartes two copies of such a matrix in in its output arguments A and B. | |||
// The generated matrices is filled with random entries in [0, 1[ (+i*[0, 1[ in | |||
// the complex case). Then m is added to the diagonal; this is numerically | |||
// favorable for routines working with triangular and symmetric matrices. For | |||
// the same reason the imaginary part of the diagonal is set to 0. | |||
void s2matgen(const int m, const int n, float *A, float *B) { | |||
srand(time(NULL) + (size_t) A); | |||
int i, j; | |||
for (i = 0; i < m; i++) | |||
for (j = 0; j < n; j++) | |||
A[i + m * j] = B[i + m * j] = (float) rand() / RAND_MAX + m * (i == j); | |||
} | |||
void d2matgen(const int m, const int n, double *A, double *B) { | |||
srand(time(NULL) + (size_t) A); | |||
int i, j; | |||
for (i = 0; i < m; i++) | |||
for (j = 0; j < n; j++) | |||
A[i + m * j] = B[i + m * j] = (double) rand() / RAND_MAX + m * (i == j); | |||
} | |||
void c2matgen(const int m, const int n, float *A, float *B) { | |||
srand(time(NULL) + (size_t) A); | |||
int i, j; | |||
for (i = 0; i < m; i++) | |||
for (j = 0; j < n; j++) { | |||
A[2* (i + m * j)] = B[2 * (i + m * j)] = (float) rand() / RAND_MAX + m * (i == j); | |||
A[2* (i + m * j) + 1] = B[2 * (i + m * j) + 1] = ((float) rand() / RAND_MAX) * (i != j); | |||
} | |||
} | |||
void z2matgen(const int m, const int n, double *A, double *B) { | |||
srand(time(NULL) + (size_t) A); | |||
int i, j; | |||
for (i = 0; i < m; i++) | |||
for (j = 0; j < n; j++) { | |||
A[2* (i + m * j)] = B[2 * (i + m * j)] = (double) rand() / RAND_MAX + m * (i == j); | |||
A[2* (i + m * j) + 1] = B[2 * (i + m * j) + 1] = ((double) rand() / RAND_MAX) * (i != j); | |||
} | |||
} | |||
//////////////////////// | |||
// error computations // | |||
//////////////////////// | |||
// Each routine x2vecerrr is passed a vector lengh n and two vectors x and y. | |||
// It returns the maximum of the element-wise error between these two vectors. | |||
// This error is the minimum of the absolute difference and the relative | |||
// differene with respect to y. | |||
double i2vecerr(const int n, const int *x, const int *y) { | |||
double error = 0; | |||
int i; | |||
for (i = 0; i < n; i++) { | |||
double nom = abs(x[i] - y[i]); | |||
double den = abs(y[i]); | |||
error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); | |||
} | |||
return error; | |||
} | |||
double s2vecerr(const int n, const float *x, const float *y) { | |||
float error = 0; | |||
int i; | |||
for (i = 0; i < n; i++) { | |||
double nom = fabs((double) x[i] - y[i]); | |||
double den = fabs(y[i]); | |||
error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); | |||
} | |||
return error; | |||
} | |||
double d2vecerr(const int n, const double *x, const double *y) { | |||
double error = 0; | |||
int i; | |||
for (i = 0; i < n; i++) { | |||
double nom = fabs(x[i] - y[i]); | |||
double den = fabs(y[i]); | |||
error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); | |||
} | |||
return error; | |||
} | |||
double c2vecerr(const int n, const float *x, const float *y) { | |||
double error = 0; | |||
int i; | |||
for (i = 0; i < n; i++) { | |||
double nom = sqrt(((double) x[2 * i] - y[2 * i]) * ((double) x[2 * i] - y[2 * i]) + ((double) x[2 * i + 1] - y[2 * i + 1]) * ((double) x[2 * i + 1] - y[2 * i + 1])); | |||
double den = sqrt((double) y[2 * i] * y[2 * i] + (double) y[2 * i + 1] * y[2 * i + 1]); | |||
error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); | |||
} | |||
return error; | |||
} | |||
double z2vecerr(const int n, const double *x, const double *y) { | |||
double error = 0; | |||
int i; | |||
for (i = 0; i < n; i++) { | |||
double nom = sqrt((x[2 * i] - y[2 * i]) * (x[2 * i] - y[2 * i]) + (x[2 * i + 1] - y[2 * i + 1]) * (x[2 * i + 1] - y[2 * i + 1])); | |||
double den = sqrt(y[2 * i] * y[2 * i] + y[2 * i + 1] * y[2 * i + 1]); | |||
error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); | |||
} | |||
return error; | |||
} |
@@ -0,0 +1,15 @@ | |||
#ifndef TEST_UTIL_H | |||
#define TEST_UTIL_H | |||
void s2matgen(int, int, float *, float *); | |||
void d2matgen(int, int, double *, double *); | |||
void c2matgen(int, int, float *, float *); | |||
void z2matgen(int, int, double *, double *); | |||
double i2vecerr(int, const int *, const int *); | |||
double s2vecerr(int, const float *, const float *); | |||
double d2vecerr(int, const double *, const double *); | |||
double c2vecerr(int, const float *, const float *); | |||
double z2vecerr(int, const double *, const double *); | |||
#endif /* TEST_UTIL_H */ |
@@ -0,0 +1,43 @@ | |||
#include "test.h" | |||
datatype *A[2]; | |||
int *ipiv[2], info; | |||
int kl, ku, ld; | |||
void pre() { | |||
int i; | |||
x2matgen(ld, n, A[0], A[1]); | |||
for (i = 0; i < n; i++) { | |||
// set diagonal | |||
A[0][x1 * (i + ld * i)] = | |||
A[1][x1 * (i + ld * i)] = (datatype) rand() / RAND_MAX; | |||
} | |||
memset(ipiv[0], 0, n * sizeof(int)); | |||
memset(ipiv[1], 0, n * sizeof(int)); | |||
} | |||
void post() { | |||
error = x2vecerr(ld * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); | |||
} | |||
void tests() { | |||
kl = n - 10; | |||
ku = n; | |||
ld = 2 * kl + ku + 1; | |||
A[0] = xmalloc(ld * n); | |||
A[1] = xmalloc(ld * n); | |||
ipiv[0] = imalloc(n); | |||
ipiv[1] = imalloc(n); | |||
#define ROUTINE XPREF(gbtrf) | |||
TEST(&n, &n, &kl, &ku, A[i], &ld, ipiv[i], &info); | |||
TEST(&n, &n2, &kl, &ku, A[i], &ld, ipiv[i], &info); | |||
TEST(&n2, &n, &kl, &ku, A[i], &ld, ipiv[i], &info); | |||
free(A[0]); | |||
free(A[1]); | |||
free(ipiv[0]); | |||
free(ipiv[1]); | |||
} |
@@ -0,0 +1,65 @@ | |||
#include "test.h" | |||
datatype *A[2], *B[2], *C[2], *Ctmp; | |||
int info; | |||
void pre() { | |||
x2matgen(n, n, A[0], A[1]); | |||
x2matgen(n, n, B[0], B[1]); | |||
x2matgen(n, n, C[0], C[1]); | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, C[0], C[1]); | |||
} | |||
#define ROUTINE XPREF(gemmt) | |||
#define xlacpy XPREF(LAPACK(lacpy)) | |||
#define xgemm XPREF(BLAS(gemm)) | |||
extern void xlacpy(const char *, const int *, const int *, const datatype *, const int *, datatype *, const int *); | |||
extern void xgemm(const char *, const char *, const int *, const int *, const int *, const datatype *, const datatype *, const int *, const datatype *, const int *, const datatype *, const datatype *, const int*); | |||
void XLAPACK(ROUTINE)( | |||
const char *uplo, const char *transA, const char *transB, | |||
const int *n, const int *k, | |||
const datatype *alpha, const datatype *A, const int *ldA, | |||
const datatype *B, const int *ldB, | |||
const datatype *beta, datatype *C, const int *ldC | |||
) { | |||
xlacpy(uplo, n, n, C, ldC, Ctmp, n); | |||
xgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, Ctmp, n); | |||
xlacpy(uplo, n, n, Ctmp, ldC, C, n); | |||
} | |||
void tests() { | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
B[0] = xmalloc(n * n); | |||
B[1] = xmalloc(n * n); | |||
C[0] = xmalloc(n * n); | |||
C[1] = xmalloc(n * n); | |||
Ctmp = xmalloc(n * n); | |||
TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n); | |||
TEST("L", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("L", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("L", "T", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("L", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n); | |||
TEST("U", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("U", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("U", "T", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
TEST("U", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); | |||
free(A[0]); | |||
free(A[1]); | |||
free(B[0]); | |||
free(B[1]); | |||
free(C[0]); | |||
free(C[1]); | |||
free(Ctmp); | |||
} |
@@ -0,0 +1,32 @@ | |||
#include "test.h" | |||
datatype *A[2]; | |||
int *ipiv[2], info; | |||
void pre() { | |||
x2matgen(n, n, A[0], A[1]); | |||
memset(ipiv[0], 0, n * sizeof(int)); | |||
memset(ipiv[1], 0, n * sizeof(int)); | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); | |||
} | |||
void tests() { | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
ipiv[0] = imalloc(n); | |||
ipiv[1] = imalloc(n); | |||
#define ROUTINE XPREF(getrf) | |||
TEST(&n, &n, A[i], &n, ipiv[i], &info); | |||
TEST(&n, &n2, A[i], &n, ipiv[i], &info); | |||
TEST(&n2, &n, A[i], &n, ipiv[i], &info); | |||
free(A[0]); | |||
free(A[1]); | |||
free(ipiv[0]); | |||
free(ipiv[1]); | |||
} |
@@ -0,0 +1,32 @@ | |||
#include "test.h" | |||
datatype *A[2], *B[2]; | |||
int info; | |||
void pre() { | |||
x2matgen(n, n, A[0], A[1]); | |||
x2matgen(n, n, B[0], B[1]); | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, A[0], A[1]); | |||
} | |||
void tests() { | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
B[0] = xmalloc(n * n); | |||
B[1] = xmalloc(n * n); | |||
#define ROUTINE XPREF(hegst) | |||
TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info); | |||
TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info); | |||
TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info); | |||
TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info); | |||
free(A[0]); | |||
free(A[1]); | |||
free(B[0]); | |||
free(B[1]); | |||
} |
@@ -0,0 +1,40 @@ | |||
#include "test.h" | |||
datatype *A[2], *Work; | |||
int *ipiv[2], info; | |||
void pre() { | |||
x2matgen(n, n, A[0], A[1]); | |||
memset(ipiv[0], 0, n * sizeof(int)); | |||
memset(ipiv[1], 0, n * sizeof(int)); | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); | |||
} | |||
void tests() { | |||
const int lWork = n * n; | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
ipiv[0] = imalloc(n); | |||
ipiv[1] = imalloc(n); | |||
Work = xmalloc(lWork); | |||
#define ROUTINE XPREF(hetrf) | |||
TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); | |||
TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); | |||
#undef ROUTINE | |||
#define ROUTINE XPREF(hetrf_rook) | |||
TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); | |||
TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); | |||
free(A[0]); | |||
free(A[1]); | |||
free(ipiv[0]); | |||
free(ipiv[1]); | |||
free(Work); | |||
} |
@@ -0,0 +1,25 @@ | |||
#include "test.h" | |||
datatype *A[2]; | |||
int info; | |||
void pre() { | |||
x2matgen(n, n, A[0], A[1]); | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, A[0], A[1]); | |||
} | |||
void tests() { | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
#define ROUTINE XPREF(lauum) | |||
TEST("L", &n, A[i], &n, &info); | |||
TEST("U", &n, A[i], &n, &info); | |||
free(A[0]); | |||
free(A[1]); | |||
} |
@@ -0,0 +1,40 @@ | |||
#include "test.h" | |||
datatype *A[2]; | |||
int info[2]; | |||
int n; | |||
void pre() { | |||
int i; | |||
x2matgen(n, n, A[0], A[1]); | |||
for (i = 0; i < n; i++) { | |||
// set diagonal | |||
A[0][x1 * (i + n * i)] = | |||
A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; | |||
// set first row | |||
A[0][x1 * (n * i)] = | |||
A[1][x1 * (n * i)] = (datatype) rand() / RAND_MAX + n; | |||
} | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, A[0], A[1]); | |||
} | |||
void tests() { | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
#define ROUTINE XPREF(pbtrf) | |||
const int | |||
kd1 = n / 4, | |||
kd2 = n * 3 / 4; | |||
TEST("L", &n, &kd1, A[i], &n, &info[i]); | |||
TEST("L", &n, &kd2, A[i], &n, &info[i]); | |||
TEST("U", &n, &kd1, A[i] - x1 * kd1, &n, &info[i]); | |||
TEST("U", &n, &kd2, A[i] - x1 * kd2, &n, &info[i]); | |||
free(A[0]); | |||
free(A[1]); | |||
} |
@@ -0,0 +1,25 @@ | |||
#include "test.h" | |||
datatype *A[2]; | |||
int info; | |||
void pre() { | |||
x2matgen(n, n, A[0], A[1]); | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, A[0], A[1]); | |||
} | |||
void tests() { | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
#define ROUTINE XPREF(potrf) | |||
TEST("L", &n, A[i], &n, &info); | |||
TEST("U", &n, A[i], &n, &info); | |||
free(A[0]); | |||
free(A[1]); | |||
} |
@@ -0,0 +1,32 @@ | |||
#include "test.h" | |||
datatype *A[2], *B[2]; | |||
int info; | |||
void pre() { | |||
x2matgen(n, n, A[0], A[1]); | |||
x2matgen(n, n, B[0], B[1]); | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, A[0], A[1]); | |||
} | |||
void tests() { | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
B[0] = xmalloc(n * n); | |||
B[1] = xmalloc(n * n); | |||
#define ROUTINE XPREF(sygst) | |||
TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info); | |||
TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info); | |||
TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info); | |||
TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info); | |||
free(A[0]); | |||
free(A[1]); | |||
free(B[0]); | |||
free(B[1]); | |||
} |
@@ -0,0 +1,40 @@ | |||
#include "test.h" | |||
datatype *A[2], *Work; | |||
int *ipiv[2], info; | |||
void pre() { | |||
x2matgen(n, n, A[0], A[1]); | |||
memset(ipiv[0], 0, n * sizeof(int)); | |||
memset(ipiv[1], 0, n * sizeof(int)); | |||
} | |||
void post() { | |||
error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); | |||
} | |||
void tests() { | |||
const int lWork = n * n; | |||
A[0] = xmalloc(n * n); | |||
A[1] = xmalloc(n * n); | |||
ipiv[0] = imalloc(n); | |||
ipiv[1] = imalloc(n); | |||
Work = xmalloc(lWork); | |||
#define ROUTINE XPREF(sytrf) | |||
TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); | |||
TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); | |||
#undef ROUTINE | |||
#define ROUTINE XPREF(sytrf_rook) | |||
TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); | |||
TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); | |||
free(A[0]); | |||
free(A[1]); | |||
free(ipiv[0]); | |||
free(ipiv[1]); | |||
free(Work); | |||
} |