| @@ -0,0 +1,22 @@ | |||
| The MIT License (MIT) | |||
| Copyright (c) 2016 Elmar Peise | |||
| Permission is hereby granted, free of charge, to any person obtaining a copy | |||
| of this software and associated documentation files (the "Software"), to deal | |||
| in the Software without restriction, including without limitation the rights | |||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
| copies of the Software, and to permit persons to whom the Software is | |||
| furnished to do so, subject to the following conditions: | |||
| The above copyright notice and this permission notice shall be included in all | |||
| copies or substantial portions of the Software. | |||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
| SOFTWARE. | |||
| @@ -0,0 +1,64 @@ | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| SRC = $(wildcard src/*.c) | |||
| OBJS = $(SRC:%.c=%.o) | |||
| TEST_SUITS = \ | |||
| slauum dlauum clauum zlauum \ | |||
| spotrf dpotrf cpotrf zpotrf \ | |||
| spbtrf dpbtrf cpbtrf zpbtrf \ | |||
| ssygst dsygst chegst zhegst \ | |||
| ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ | |||
| sgetrf dgetrf cgetrf zgetrf \ | |||
| sgbtrf dgbtrf cgbtrf zgbtrf \ | |||
| strsyl dtrsyl ctrsyl ztrsyl \ | |||
| stgsyl dtgsyl ctgsyl ztgsyl \ | |||
| sgemmt dgemmt cgemmt zgemmt | |||
| TESTS = $(TEST_SUITS:%=test/%.pass) # dummies | |||
| TEST_EXES = $(TEST_SUITS:%=test/%.x) | |||
| LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm | |||
| .SECONDARY: $(TEST_EXES) | |||
| .PHONY: test | |||
| # ReLAPACK compilation | |||
| libs: $(OBJS) | |||
| @echo "Building ReLAPACK library $(LIBNAME)" | |||
| $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) | |||
| $(RANLIB) $(TOPDIR)/$(LIBNAME) | |||
| %.o: %.c config.h | |||
| $(CC) $(CFLAGS) -c $< -o $@ | |||
| # ReLAPACK testing | |||
| test: $(TEST_EXES) $(TESTS) | |||
| @echo "passed all tests" | |||
| test/%.pass: test/%.x | |||
| @echo -n $*: | |||
| @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) | |||
| test/s%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/d%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/c%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/z%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| # cleaning up | |||
| clean: | |||
| rm -f $(OBJS) test/util.o test/*.x | |||
| @@ -0,0 +1,68 @@ | |||
| ReLAPACK | |||
| ======== | |||
| [](https://travis-ci.org/HPAC/ReLAPACK) | |||
| [Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK) | |||
| ReLAPACK offers a collection of recursive algorithms for many of LAPACK's | |||
| compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK | |||
| integrates effortlessly into existing application codes. ReLAPACK's routines | |||
| not only outperform the reference LAPACK but also improve upon the performance | |||
| of tuned implementations, such as OpenBLAS and MKL. | |||
| Coverage | |||
| -------- | |||
| For a detailed list of covered operations and an overview of operations to which | |||
| recursion is not efficiently applicable, see [coverage.md](coverage.md). | |||
| Installation | |||
| ------------ | |||
| To compile with the default configuration, simply run `make` to create the | |||
| library `librelapack.a`. | |||
| ### Linking with MKL | |||
| Note that to link with MKL, you currently need to set the flag | |||
| `COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and | |||
| `ztrsyl`. For further configuration options see [config.md](config.md). | |||
| ### Dependencies | |||
| ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked | |||
| kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized | |||
| and machine specific implementations of these libraries, which are commonly | |||
| provided by hardware vendors or available as open source (e.g., | |||
| [OpenBLAS](http://www.openblas.net/)). | |||
| Testing | |||
| ------- | |||
| ReLAPACK's test suite compares its routines numerically with LAPACK's | |||
| counterparts. To set up the tests (located int `test/`) you need to specify | |||
| link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then | |||
| `make test` runs the tests. For details on the performed tests, see | |||
| [test/README.md](test/README.md). | |||
| Examples | |||
| -------- | |||
| Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the | |||
| covered routines applies directly to ReLAPACK. A few separate examples are | |||
| given in `examples/`. For details, see [examples/README.md](examples/README.md). | |||
| Citing | |||
| ------ | |||
| When referencing ReLAPACK, please cite the preprint of the paper | |||
| [Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763): | |||
| @article{relapack, | |||
| author = {Elmar Peise and Paolo Bientinesi}, | |||
| title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection}, | |||
| journal = {CoRR}, | |||
| volume = {abs/1602.06763}, | |||
| year = {2016}, | |||
| url = {http://arxiv.org/abs/1602.06763}, | |||
| } | |||
| @@ -0,0 +1,208 @@ | |||
| #ifndef RELAPACK_CONFIG_H | |||
| #define RELAPACK_CONFIG_H | |||
| // ReLAPACK configuration file. | |||
| // See also config.md | |||
| /////////////////////////////// | |||
| // BLAS/LAPACK obect symbols // | |||
| /////////////////////////////// | |||
| // BLAS routines linked against have a trailing underscore | |||
| #define BLAS_UNDERSCORE 1 | |||
| // LAPACK routines linked against have a trailing underscore | |||
| #define LAPACK_UNDERSCORE BLAS_UNDERSCORE | |||
| // Complex BLAS/LAPACK routines return their result in the first argument | |||
| // This option must be enabled when linking to MKL for ctrsyl and ztrsyl to | |||
| // work. | |||
| #define COMPLEX_FUNCTIONS_AS_ROUTINES 0 | |||
| #ifdef F_INTERFACE_INTEL | |||
| #define COMPLEX_FUNCTIONS_AS_ROUTINES 1 | |||
| #endif | |||
| #define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| #define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| // The BLAS-like extension xgemmt is provided by an external library. | |||
| #define HAVE_XGEMMT 0 | |||
| //////////////////////////// | |||
| // Use malloc in ReLAPACK // | |||
| //////////////////////////// | |||
| #define ALLOW_MALLOC 1 | |||
| // allow malloc in xsygst for improved performance | |||
| #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC | |||
| // allow malloc in xsytrf if the passed work buffer is too small | |||
| #define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC | |||
| //////////////////////////////// | |||
| // LAPACK routine replacement // | |||
| //////////////////////////////// | |||
| // The following macros specify which routines are included in the library under | |||
| // LAPACK's symbol names: 1 included, 0 not included | |||
| #define INCLUDE_ALL 1 | |||
| #define INCLUDE_XLAUUM INCLUDE_ALL | |||
| #define INCLUDE_SLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_DLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_CLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_ZLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_XSYGST INCLUDE_ALL | |||
| #define INCLUDE_SSYGST INCLUDE_XSYGST | |||
| #define INCLUDE_DSYGST INCLUDE_XSYGST | |||
| #define INCLUDE_CHEGST INCLUDE_XSYGST | |||
| #define INCLUDE_ZHEGST INCLUDE_XSYGST | |||
| #define INCLUDE_XTRTRI INCLUDE_ALL | |||
| #define INCLUDE_STRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_DTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_CTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_ZTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_XPOTRF INCLUDE_ALL | |||
| #define INCLUDE_SPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_DPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_CPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_ZPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_XPBTRF INCLUDE_ALL | |||
| #define INCLUDE_SPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_DPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_CPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_ZPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_XSYTRF INCLUDE_ALL | |||
| #define INCLUDE_SSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_DSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_CSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_CHETRF INCLUDE_XSYTRF | |||
| #define INCLUDE_ZSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_ZHETRF INCLUDE_XSYTRF | |||
| #define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF | |||
| #define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF | |||
| #define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF | |||
| #define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF | |||
| #define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF | |||
| #define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF | |||
| #define INCLUDE_XGETRF INCLUDE_ALL | |||
| #define INCLUDE_SGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_DGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_CGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_ZGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_XGBTRF INCLUDE_ALL | |||
| #define INCLUDE_SGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_DGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_CGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_ZGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_XTRSYL INCLUDE_ALL | |||
| #define INCLUDE_STRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_DTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_CTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_ZTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_XTGSYL INCLUDE_ALL | |||
| #define INCLUDE_STGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_DTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_CTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_ZTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_XGEMMT 0 | |||
| #define INCLUDE_SGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_DGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_CGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_ZGEMMT INCLUDE_XGEMMT | |||
| ///////////////////// | |||
| // crossover sizes // | |||
| ///////////////////// | |||
| // default crossover size | |||
| #define CROSSOVER 24 | |||
| // individual crossover sizes | |||
| #define CROSSOVER_XLAUUM CROSSOVER | |||
| #define CROSSOVER_SLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_DLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_CLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_XSYGST CROSSOVER | |||
| #define CROSSOVER_SSYGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_DSYGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_CHEGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_ZHEGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_XTRTRI CROSSOVER | |||
| #define CROSSOVER_STRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_DTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_CTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_XPOTRF CROSSOVER | |||
| #define CROSSOVER_SPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_DPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_CPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_XPBTRF CROSSOVER | |||
| #define CROSSOVER_SPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_DPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_CPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_XSYTRF CROSSOVER | |||
| #define CROSSOVER_SSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_DSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_CSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_CHETRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_ZHETRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF | |||
| #define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF | |||
| #define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF | |||
| #define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF | |||
| #define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF | |||
| #define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF | |||
| #define CROSSOVER_XGETRF CROSSOVER | |||
| #define CROSSOVER_SGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_DGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_CGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_ZGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_XGBTRF CROSSOVER | |||
| #define CROSSOVER_SGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_DGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_CGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_XTRSYL CROSSOVER | |||
| #define CROSSOVER_STRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_DTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_CTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_XTGSYL CROSSOVER | |||
| #define CROSSOVER_STGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_DTGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_CTGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL | |||
| // sytrf helper routine | |||
| #define CROSSOVER_XGEMMT CROSSOVER_XSYTRF | |||
| #define CROSSOVER_SGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_DGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_CGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT | |||
| #endif /* RELAPACK_CONFIG_H */ | |||
| @@ -0,0 +1,87 @@ | |||
| RELAPACK Configuration | |||
| ====================== | |||
| ReLAPACK has two configuration files: `make.inc`, which is included by the | |||
| Makefile, and `config.h` which is included in the source files. | |||
| Build and Testing Environment | |||
| ----------------------------- | |||
| The build environment (compiler and flags) and the test configuration (linker | |||
| flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size | |||
| and error bounds are defined in `test/config.h`. | |||
| The library `librelapack.a` is compiled by invoking `make`. The tests are | |||
| performed by either `make test` or calling `make` in the test folder. | |||
| BLAS/LAPACK complex function interfaces | |||
| --------------------------------------- | |||
| For BLAS and LAPACK functions that return a complex number, there exist two | |||
| conflicting (FORTRAN compiler dependent) calling conventions: either the result | |||
| is returned as a `struct` of two floating point numbers or an additional first | |||
| argument with a pointer to such a `struct` is used. By default ReLAPACK uses | |||
| the former (which is what gfortran uses), but it can switch to the latter by | |||
| setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK | |||
| specific counterparts) to `1` in `config.h`. | |||
| **For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.** | |||
| (Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases | |||
| will segfault or return errors on the order of 1 or larger.) | |||
| BLAS extension `xgemmt` | |||
| ----------------------- | |||
| The LDL decompositions require a general matrix-matrix product that updates only | |||
| a triangular matrix called `xgemmt`. If the BLAS implementation linked against | |||
| provides such a routine, set the flag `HAVE_XGEMMT` to `1` in `config.h`; | |||
| otherwise, ReLAPACK uses its own recursive implementation of these kernels. | |||
| `xgemmt` is provided by MKL. | |||
| Routine Selection | |||
| ----------------- | |||
| ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the | |||
| corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to | |||
| `1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g., | |||
| `dgetrf_`). By default, wrappers for all routines are enabled. | |||
| Crossover Size | |||
| -------------- | |||
| The crossover size determines below which matrix sizes ReLAPACK's recursive | |||
| algorithms switch to LAPACK's unblocked routines to avoid tiny BLAS Level 3 | |||
| routines. The crossover size is set in `config.h` and can be chosen either | |||
| globally for the entire library, by operation, or individually by routine. | |||
| Allowing Temporary Buffers | |||
| -------------------------- | |||
| Two of ReLAPACK's routines make use of temporary buffers, which are allocated | |||
| and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine | |||
| specific counterparts) to 0 in `config.h` will disable these buffers. The | |||
| affected routines are: | |||
| * `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in | |||
| LAPACK, this size can be queried by setting `lWork = -1` and the passed | |||
| buffer will be used if it is large enough; only if it is not, a local buffer | |||
| will be allocated. | |||
| The advantage of this mechanism is that ReLAPACK will seamlessly work even | |||
| with codes that statically provide too little memory instead of breaking | |||
| them. | |||
| * `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem | |||
| to standard form can use an auxiliary buffer of size n^2 / 2 to avoid | |||
| redundant computations. It thereby performs about 30% less FLOPs than | |||
| LAPACK. | |||
| FORTRAN symbol names | |||
| -------------------- | |||
| ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN interfaces. | |||
| Since these libraries usually have an underscore to their symbol names, ReLAPACK | |||
| has configuration switches in `config.h` to adjust the corresponding routine | |||
| names. | |||
| @@ -0,0 +1,212 @@ | |||
| Coverage of ReLAPACK | |||
| ==================== | |||
| This file lists all LAPACK compute routines that are covered by recursive | |||
| algorithms in ReLAPACK, it also lists all of LAPACK's blocked algorithms which | |||
| are not (yet) part of ReLAPACK. | |||
| <!-- START doctoc generated TOC please keep comment here to allow auto update --> | |||
| <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE --> | |||
| **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* | |||
| - [List of covered LAPACK routines](#list-of-covered-lapack-routines) | |||
| - [`xlauum`](#xlauum) | |||
| - [`xsygst`](#xsygst) | |||
| - [`xtrtri`](#xtrtri) | |||
| - [`xpotrf`](#xpotrf) | |||
| - [`xpbtrf`](#xpbtrf) | |||
| - [`xsytrf`](#xsytrf) | |||
| - [`xgetrf`](#xgetrf) | |||
| - [`xgbtrf`](#xgbtrf) | |||
| - [`xtrsyl`](#xtrsyl) | |||
| - [`xtgsyl`](#xtgsyl) | |||
| - [Covered BLAS extension](#covered-blas-extension) | |||
| - [`xgemmt`](#xgemmt) | |||
| - [Not covered yet](#not-covered-yet) | |||
| - [`xpstrf`](#xpstrf) | |||
| - [Not covered: extra FLOPs](#not-covered-extra-flops) | |||
| - [QR decomposition (and related)](#qr-decomposition-and-related) | |||
| - [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal) | |||
| - [Symmetric reduction to bidiagonal](#symmetric-reduction-to-bidiagonal) | |||
| - [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg) | |||
| <!-- END doctoc generated TOC please keep comment here to allow auto update --> | |||
| List of covered LAPACK routines | |||
| ------------------------------- | |||
| ### `xlauum` | |||
| Multiplication of a triangular matrix with its (complex conjugate) transpose, | |||
| resulting in a symmetric (Hermitian) matrix. | |||
| Routines: `slauum`, `dlauum`, `clauum`, `zlauum` | |||
| Operations: | |||
| * A = L^T L | |||
| * A = U U^T | |||
| ### `xsygst` | |||
| Simultaneous two-sided multiplication of a symmetric matrix with a triangular | |||
| matrix and its transpose | |||
| Routines: `ssygst`, `dsygst`, `chegst`, `zhegst` | |||
| Operations: | |||
| * A = inv(L) A inv(L^T) | |||
| * A = inv(U^T) A inv(U) | |||
| * A = L^T A L | |||
| * A = U A U^T | |||
| ### `xtrtri` | |||
| Inversion of a triangular matrix | |||
| Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri` | |||
| Operations: | |||
| * L = inv(L) | |||
| * U = inv(U) | |||
| ### `xpotrf` | |||
| Cholesky decomposition of a symmetric (Hermitian) positive definite matrix | |||
| Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf` | |||
| Operations: | |||
| * L L^T = A | |||
| * U^T U = A | |||
| ### `xpbtrf` | |||
| Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix | |||
| Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf` | |||
| Operations: | |||
| * L L^T = A | |||
| * U^T U = A | |||
| ### `xsytrf` | |||
| LDL decomposition of a symmetric (or Hermitian) matrix | |||
| Routines: | |||
| * `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`, | |||
| * `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`, | |||
| `zhetrf_rook` | |||
| Operations: | |||
| * L D L^T = A | |||
| * U^T D U = A | |||
| ### `xgetrf` | |||
| LU decomposition of a general matrix with pivoting | |||
| Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf` | |||
| Operation: P L U = A | |||
| ### `xgbtrf` | |||
| LU decomposition of a general banded matrix with pivoting | |||
| Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf` | |||
| Operation: L U = A | |||
| ### `xtrsyl` | |||
| Solution of the quasi-triangular Sylvester equation | |||
| Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl` | |||
| Operations: | |||
| * A X + B Y = C -> X | |||
| * A^T X + B Y = C -> X | |||
| * A X + B^T Y = C -> X | |||
| * A^T X + B^T Y = C -> X | |||
| * A X - B Y = C -> X | |||
| * A^T X - B Y = C -> X | |||
| * A X - B^T Y = C -> X | |||
| * A^T X - B^T Y = C -> X | |||
| ### `xtgsyl` | |||
| Solution of the generalized Sylvester equations | |||
| Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl` | |||
| Operations: | |||
| * A R - L B = C, D R - L E = F -> L, R | |||
| * A^T R + D^T L = C, R B^T - L E^T = -F -> L, R | |||
| Covered BLAS extension | |||
| ---------------------- | |||
| ### `xgemmt` | |||
| Matrix-matrix product updating only a triangular part of the result | |||
| Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt` | |||
| Operations: | |||
| * C = alpha A B + beta C | |||
| * C = alpha A B^T + beta C | |||
| * C = alpha A^T B + beta C | |||
| * C = alpha A^T B^T + beta C | |||
| Not covered yet | |||
| --------------- | |||
| The following operation is implemented as a blocked algorithm in LAPACK but | |||
| currently not yet covered in ReLAPACK as a recursive algorithm | |||
| ### `xpstrf` | |||
| Cholesky decomposition of a positive semi-definite matrix with complete pivoting. | |||
| Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf` | |||
| Operations: | |||
| * P L L^T P^T = A | |||
| * P U^T U P^T = A | |||
| Not covered: extra FLOPs | |||
| ------------------------ | |||
| The following routines are not covered because recursive variants would require | |||
| considerably more FLOPs or operate on banded matrices. | |||
| ### QR decomposition (and related) | |||
| Routines: | |||
| * `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf` | |||
| * `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf` | |||
| * `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf` | |||
| * `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf` | |||
| * `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf` | |||
| Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A | |||
| Routines for multiplication with Q: | |||
| * `sormqr`, `dormqr`, `cunmqr`, `zunmqr` | |||
| * `sormrq`, `dormrq`, `cunmrq`, `zunmrq` | |||
| * `sormql`, `dormql`, `cunmql`, `zunmql` | |||
| * `sormlq`, `dormlq`, `cunmlq`, `zunmlq` | |||
| * `sormrz`, `dormrz`, `cunmrz`, `zunmrz` | |||
| Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T | |||
| Routines for construction of Q: | |||
| * `sorgqr`, `dorgqr`, `cungqr`, `zungqr` | |||
| * `sorgrq`, `dorgrq`, `cungrq`, `zungrq` | |||
| * `sorgql`, `dorgql`, `cungql`, `zungql` | |||
| * `sorglq`, `dorglq`, `cunglq`, `zunglq` | |||
| ### Symmetric reduction to tridiagonal | |||
| Routines: `ssytrd`, `dsytrd`, `csytrd`, `zsytrd` | |||
| Operation: Q T Q^T = A | |||
| ### Symmetric reduction to bidiagonal | |||
| Routines: `ssybrd`, `dsybrd`, `csybrd`, `zsybrd` | |||
| Operation: Q T P^T = A | |||
| ### Reduction to upper Hessenberg | |||
| Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd` | |||
| Operation: Q H Q^T = A | |||
| @@ -0,0 +1,67 @@ | |||
| #ifndef RELAPACK_H | |||
| #define RELAPACK_H | |||
| void RELAPACK_slauum(const char *, const int *, float *, const int *, int *); | |||
| void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *); | |||
| void RELAPACK_clauum(const char *, const int *, float *, const int *, int *); | |||
| void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *); | |||
| void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *); | |||
| void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *); | |||
| void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *); | |||
| void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *); | |||
| void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *); | |||
| void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *); | |||
| void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *); | |||
| void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *); | |||
| void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *); | |||
| void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *); | |||
| void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *); | |||
| void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); | |||
| void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); | |||
| void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); | |||
| void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); | |||
| void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); | |||
| void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
| void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
| void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
| void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
| void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
| void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
| void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
| void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
| void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
| void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
| void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
| void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
| void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); | |||
| void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); | |||
| void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); | |||
| void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); | |||
| void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| #endif /* RELAPACK_H */ | |||
| @@ -0,0 +1,61 @@ | |||
| #ifndef BLAS_H | |||
| #define BLAS_H | |||
| extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); | |||
| extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); | |||
| extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); | |||
| extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); | |||
| extern void BLAS(sscal)(const int *, const float *, float *, const int *); | |||
| extern void BLAS(dscal)(const int *, const double *, double *, const int *); | |||
| extern void BLAS(cscal)(const int *, const float *, float *, const int *); | |||
| extern void BLAS(zscal)(const int *, const double *, double *, const int *); | |||
| extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| #if HAVE_XGEMMT | |||
| extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| #endif | |||
| #endif /* BLAS_H */ | |||
| @@ -0,0 +1,230 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, | |||
| const int *, float *, const int *, int *, float *, const int *, float *, | |||
| const int *, int *); | |||
| /** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cgbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_cgbtrf( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kl < 0) | |||
| *info = -3; | |||
| else if (*ku < 0) | |||
| *info = -4; | |||
| else if (*ldAb < 2 * *kl + *ku + 1) | |||
| *info = -6; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CGBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Constant | |||
| const float ZERO[] = { 0., 0. }; | |||
| // Result upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| float *const A = Ab + 2 * kv; | |||
| // Zero upper diagonal fill-in elements | |||
| int i, j; | |||
| for (j = 0; j < *n; j++) { | |||
| float *const A_j = A + 2 * *ldA * j; | |||
| for (i = MAX(0, j - kv); i < j - *ku; i++) | |||
| A_j[2 * i] = A_j[2 * i + 1] = 0.; | |||
| } | |||
| // Allocate work space | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
| const int nWorkl = (kv > n1) ? n1 : kv; | |||
| const int mWorku = (*kl > n1) ? n1 : *kl; | |||
| const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
| float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); | |||
| float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); | |||
| LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
| LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
| // Recursive kernel | |||
| RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
| // Free work space | |||
| free(Workl); | |||
| free(Worku); | |||
| } | |||
| /** cgbtrf's recursive compute kernel */ | |||
| static void RELAPACK_cgbtrf_rec( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterators | |||
| int i, j; | |||
| // Output upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| float *const A = Ab + 2 * kv; | |||
| // Splitting | |||
| const int n1 = MIN(CREC_SPLIT(*n), *kl); | |||
| const int n2 = *n - n1; | |||
| const int m1 = MIN(n1, *m); | |||
| const int m2 = *m - m1; | |||
| const int mn1 = MIN(m1, n1); | |||
| const int mn2 = MIN(m2, n2); | |||
| // Ab_L * | |||
| // Ab_BR | |||
| float *const Ab_L = Ab; | |||
| float *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * m1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * m1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, kv - n1); | |||
| const int n22 = MIN(n2 - n21, n1); | |||
| const int m21 = MIN(m2, *kl - m1); | |||
| const int m22 = MIN(m2 - m21, m1); | |||
| // n1 n21 n22 | |||
| // m * A_Rl ARr | |||
| float *const A_Rl = A_R; | |||
| float *const A_Rr = A_R + 2 * *ldA * n21; | |||
| // n1 n21 n22 | |||
| // m1 * A_TRl A_TRr | |||
| // m21 A_BLt A_BRtl A_BRtr | |||
| // m22 A_BLb A_BRbl A_BRbr | |||
| float *const A_TRl = A_TR; | |||
| float *const A_TRr = A_TR + 2 * *ldA * n21; | |||
| float *const A_BLt = A_BL; | |||
| float *const A_BLb = A_BL + 2 * m21; | |||
| float *const A_BRtl = A_BR; | |||
| float *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
| float *const A_BRbl = A_BR + 2 * m21; | |||
| float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; | |||
| // recursion(Ab_L, ipiv_T) | |||
| RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
| // Workl = A_BLb | |||
| LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
| // partially redo swaps in A_L | |||
| for (i = 0; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
| else | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
| } | |||
| } | |||
| // apply pivots to A_Rl | |||
| LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
| // apply pivots to A_Rr columnwise | |||
| for (j = 0; j < n22; j++) { | |||
| float *const A_Rrj = A_Rr + 2 * *ldA * j; | |||
| for (i = j; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| const float tmpr = A_Rrj[2 * i]; | |||
| const float tmpc = A_Rrj[2 * i + 1]; | |||
| A_Rrj[2 * i] = A_Rrj[2 * ip]; | |||
| A_Rrj[2 * i + 1] = A_Rr[2 * ip + 1]; | |||
| A_Rrj[2 * ip] = tmpr; | |||
| A_Rrj[2 * ip + 1] = tmpc; | |||
| } | |||
| } | |||
| } | |||
| // A_TRl = A_TL \ A_TRl | |||
| BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // Worku = A_TRr | |||
| LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
| // Worku = A_TL \ Worku | |||
| BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
| // A_TRr = Worku | |||
| LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_TRl | |||
| BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // A_BRbl = A_BRbl - Workl * A_TRl | |||
| BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRtr = A_BRtr - A_BLt * Worku | |||
| BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Workl * Worku | |||
| BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
| // partially undo swaps in A_L | |||
| for (i = mn1 - 1; i >= 0; i--) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
| else | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
| } | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| for (i = 0; i < mn2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,167 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cgemmt_rec(const char *, const char *, const char *, | |||
| const int *, const int *, const float *, const float *, const int *, | |||
| const float *, const int *, const float *, float *, const int *); | |||
| static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, | |||
| const int *, const int *, const float *, const float *, const int *, | |||
| const float *, const int *, const float *, float *, const int *); | |||
| /** CGEMMT computes a matrix-matrix product with general matrices but updates | |||
| * only the upper or lower triangular part of the result matrix. | |||
| * | |||
| * This routine performs the same operation as the BLAS routine | |||
| * cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
| * but only updates the triangular part of C specified by uplo: | |||
| * If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
| * otherwise the upper triangular part is updated. | |||
| * */ | |||
| void RELAPACK_cgemmt( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| #if HAVE_XGEMMT | |||
| BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| #else | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int notransA = LAPACK(lsame)(transA, "N"); | |||
| const int tranA = LAPACK(lsame)(transA, "T"); | |||
| const int ctransA = LAPACK(lsame)(transA, "C"); | |||
| const int notransB = LAPACK(lsame)(transB, "N"); | |||
| const int tranB = LAPACK(lsame)(transB, "T"); | |||
| const int ctransB = LAPACK(lsame)(transB, "C"); | |||
| int info = 0; | |||
| if (!lower && !upper) | |||
| info = 1; | |||
| else if (!tranA && !ctransA && !notransA) | |||
| info = 2; | |||
| else if (!tranB && !ctransB && !notransB) | |||
| info = 3; | |||
| else if (*n < 0) | |||
| info = 4; | |||
| else if (*k < 0) | |||
| info = 5; | |||
| else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
| info = 8; | |||
| else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
| info = 10; | |||
| else if (*ldC < MAX(1, *n)) | |||
| info = 13; | |||
| if (info) { | |||
| LAPACK(xerbla)("CGEMMT", &info); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); | |||
| const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); | |||
| // Recursive kernel | |||
| RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| #endif | |||
| } | |||
| /** cgemmt's recursive compute kernel */ | |||
| static void RELAPACK_cgemmt_rec( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { | |||
| // Unblocked | |||
| RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| } | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_T | |||
| // A_B | |||
| const float *const A_T = A; | |||
| const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); | |||
| // B_L B_R | |||
| const float *const B_L = B; | |||
| const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); | |||
| // C_TL C_TR | |||
| // C_BL C_BR | |||
| float *const C_TL = C; | |||
| float *const C_TR = C + 2 * *ldC * n1; | |||
| float *const C_BL = C + 2 * n1; | |||
| float *const C_BR = C + 2 * *ldC * n1 + 2 * n1; | |||
| // recursion(C_TL) | |||
| RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
| if (*uplo == 'L') | |||
| // C_BL = alpha A_B B_L + beta C_BL | |||
| BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
| else | |||
| // C_TR = alpha A_T B_R + beta C_TR | |||
| BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
| // recursion(C_BR) | |||
| RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
| } | |||
| /** cgemmt's unblocked compute kernel */ | |||
| static void RELAPACK_cgemmt_rec2( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| const int incB = (*transB == 'N') ? 1 : *ldB; | |||
| const int incC = 1; | |||
| int i; | |||
| for (i = 0; i < *n; i++) { | |||
| // A_0 | |||
| // A_i | |||
| const float *const A_0 = A; | |||
| const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i); | |||
| // * B_i * | |||
| const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i); | |||
| // * C_0i * | |||
| // * C_ii * | |||
| float *const C_0i = C + 2 * *ldC * i; | |||
| float *const C_ii = C + 2 * *ldC * i + 2 * i; | |||
| if (*uplo == 'L') { | |||
| const int nmi = *n - i; | |||
| if (*transA == 'N') | |||
| BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| else | |||
| BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| } else { | |||
| const int ip1 = i + 1; | |||
| if (*transA == 'N') | |||
| BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| else | |||
| BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cgetrf_rec(const int *, const int *, float *, | |||
| const int *, int *, int *); | |||
| /** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cgetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html | |||
| */ | |||
| void RELAPACK_cgetrf( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CGETRF", &minfo); | |||
| return; | |||
| } | |||
| const int sn = MIN(*m, *n); | |||
| RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| if (*m < *n) { | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int rn = *n - *m; | |||
| // A_L A_R | |||
| const float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * *m; | |||
| // A_R = apply(ipiv, A_R) | |||
| LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
| // A_R = A_L \ A_R | |||
| BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
| } | |||
| } | |||
| /** cgetrf's recursive compute kernel */ | |||
| static void RELAPACK_cgetrf_rec( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cgetf2)(m, n, A, ldA, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| const int m2 = *m - n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // recursion(A_L, ipiv_T) | |||
| RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
| // apply pivots to A_R | |||
| LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
| // A_TR = A_TL \ A_TR | |||
| BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_BL * A_TR | |||
| BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
| // recursion(A_BR, ipiv_B) | |||
| RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // apply pivots to A_BL | |||
| LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,212 @@ | |||
| #include "relapack.h" | |||
| #if XSYGST_ALLOW_MALLOC | |||
| #include "stdlib.h" | |||
| #endif | |||
| static void RELAPACK_chegst_rec(const int *, const char *, const int *, | |||
| float *, const int *, const float *, const int *, | |||
| float *, const int *, int *); | |||
| /** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's chegst. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html | |||
| * */ | |||
| void RELAPACK_chegst( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (*itype < 1 || *itype > 3) | |||
| *info = -1; | |||
| else if (!lower && !upper) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -7; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CHEGST", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Allocate work space | |||
| float *Work = NULL; | |||
| int lWork = 0; | |||
| #if XSYGST_ALLOW_MALLOC | |||
| const int n1 = CREC_SPLIT(*n); | |||
| lWork = n1 * (*n - n1); | |||
| Work = malloc(lWork * 2 * sizeof(float)); | |||
| if (!Work) | |||
| lWork = 0; | |||
| #endif | |||
| // recursive kernel | |||
| RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
| // Free work space | |||
| #if XSYGST_ALLOW_MALLOC | |||
| if (Work) | |||
| free(Work); | |||
| #endif | |||
| } | |||
| /** chegst's recursive compute kernel */ | |||
| static void RELAPACK_chegst_rec( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CHEGST, 1)) { | |||
| // Unblocked | |||
| LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ZERO[] = { 0., 0. }; | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const float HALF[] = { .5, 0. }; | |||
| const float MHALF[] = { -.5, 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterator | |||
| int i; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // B_TL B_TR | |||
| // B_BL B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + 2 * *ldB * n1; | |||
| const float *const B_BL = B + 2 * n1; | |||
| const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // recursion(A_TL, B_TL) | |||
| RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); | |||
| if (*itype == 1) | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / B_TL' | |||
| BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = -1/2 * B_BL * A_TL | |||
| BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
| } else | |||
| // A_BL = A_BL - 1/2 B_BL * A_TL | |||
| BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' | |||
| BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
| else | |||
| // A_BL = A_BL - 1/2 B_BL * A_TL | |||
| BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BL = B_BR \ A_BL | |||
| BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
| } else { | |||
| // A_TR = B_TL' \ A_TR | |||
| BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = -1/2 * A_TL * B_TR | |||
| BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
| // A_TR = A_BL + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
| } else | |||
| // A_TR = A_TR - 1/2 A_TL * B_TR | |||
| BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR | |||
| BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_TR = A_BL + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
| else | |||
| // A_TR = A_TR - 1/2 A_TL * B_TR | |||
| BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TR = A_TR / B_BR | |||
| BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
| } | |||
| else | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL * B_TL | |||
| BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = 1/2 * A_BR * B_BL | |||
| BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
| } else | |||
| // A_BL = A_BL + 1/2 A_BR * B_BL | |||
| BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL | |||
| BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
| else | |||
| // A_BL = A_BL + 1/2 A_BR * B_BL | |||
| BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BL = B_BR * A_BL | |||
| BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
| } else { | |||
| // A_TR = B_TL * A_TR | |||
| BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = 1/2 * B_TR * A_BR | |||
| BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
| // A_TR = A_TR + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
| } else | |||
| // A_TR = A_TR + 1/2 B_TR A_BR | |||
| BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' | |||
| BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_TR = A_TR + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
| else | |||
| // A_TR = A_TR + 1/2 B_TR * A_BR | |||
| BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TR = A_TR * B_BR | |||
| BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR, B_BR) | |||
| RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); | |||
| } | |||
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's chetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html | |||
| * */ | |||
| void RELAPACK_chetrf( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CHETRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** chetrf's recursive compute kernel */ | |||
| static void RELAPACK_chetrf_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_CHETRF, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = CREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| float *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| float *const A_BL_B = A + 2 * *n; | |||
| float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| float *const Work_BL = Work + 2 * n1; | |||
| float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
| // last row of A_BL | |||
| float *const A_BL_b = A_BL + 2 * n2_out; | |||
| // last row of Work_BL | |||
| float *const Work_BL_b = Work_BL + 2 * n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = CREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| float *const A_TL_T = A + 2 * *ldA * n_rest; | |||
| float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
| float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
| float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| float *const Work_L = Work; | |||
| float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,520 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static complex c_b1 = {1.f,0.f}; | |||
| static int c__1 = 1; | |||
| /** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method | |||
| * | |||
| * This routine is a minor modification of LAPACK's clahef. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * | |||
| nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, | |||
| int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| float r__1, r__2, r__3, r__4; | |||
| complex q__1, q__2, q__3, q__4; | |||
| /* Builtin functions */ | |||
| double sqrt(double), r_imag(complex *); | |||
| void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k; | |||
| static float t, r1; | |||
| static complex d11, d21, d22; | |||
| static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
| static float alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
| , complex *, int *, complex *, int *, complex *, complex * | |||
| , int *, ftnlen), ccopy_(int *, complex *, int *, | |||
| complex *, int *), cswap_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int kstep; | |||
| static float absakk; | |||
| extern /* Subroutine */ int clacgv_(int *, complex *, int *); | |||
| extern int icamax_(int *, complex *, int *); | |||
| extern /* Subroutine */ int csscal_(int *, float *, complex *, int | |||
| *); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| i__1 = k - 1; | |||
| ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + kw * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| i__1 = imax - 1; | |||
| ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| i__1 = k - imax; | |||
| ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| i__1 = k - imax; | |||
| clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + (kw - 1) * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| i__1 = k - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
| &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| /* Computing MAX */ | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( | |||
| r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( | |||
| r__2)); | |||
| rowmax = dmax(r__3,r__4); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { | |||
| kp = imax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = kk - 1 - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kk - 1 - kp; | |||
| clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| r1 = 1.f / a[i__1].r; | |||
| i__1 = k - 1; | |||
| csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| i__1 = k - 1; | |||
| clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| r_cnjg(&q__2, &d21); | |||
| c_div(&q__1, &w[k + kw * w_dim1], &q__2); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1.f / (q__1.r - 1.f); | |||
| q__2.r = t, q__2.i = 0.f; | |||
| c_div(&q__1, &q__2, &d21); | |||
| d21.r = q__1.r, d21.i = q__1.i; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
| .i; | |||
| q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
| d21.r * q__2.i + d21.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| r_cnjg(&q__2, &d21); | |||
| i__3 = j + kw * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = | |||
| q__2.r * q__3.i + q__2.i * q__3.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1; | |||
| clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = k - 2; | |||
| clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| ++j; | |||
| } | |||
| ++j; | |||
| if (jp != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
| } | |||
| if (j <= *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k | |||
| + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + k * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| i__1 = imax - k; | |||
| ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = imax - k; | |||
| clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
| imax + 1 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], | |||
| lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * | |||
| w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + (k + 1) * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
| ; | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| /* Computing MAX */ | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( | |||
| r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( | |||
| r__2)); | |||
| rowmax = dmax(r__3,r__4); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + | |||
| k * w_dim1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = kp - kk - 1; | |||
| ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kp - kk - 1; | |||
| clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| r1 = 1.f / a[i__1].r; | |||
| i__1 = *n - k; | |||
| csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| i__1 = *n - k; | |||
| clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| r_cnjg(&q__2, &d21); | |||
| c_div(&q__1, &w[k + k * w_dim1], &q__2); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1.f / (q__1.r - 1.f); | |||
| q__2.r = t, q__2.i = 0.f; | |||
| c_div(&q__1, &q__2, &d21); | |||
| d21.r = q__1.r, d21.i = q__1.i; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| r_cnjg(&q__2, &d21); | |||
| i__3 = j + k * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = | |||
| q__2.r * q__3.i + q__2.i * q__3.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
| .i; | |||
| q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
| d21.r * q__2.i + d21.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = *n - k; | |||
| clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = *n - k - 1; | |||
| clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| --j; | |||
| } | |||
| --j; | |||
| if (jp != jj && j >= 1) { | |||
| cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j >= 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's chetrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_chetrf_rook( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CHETRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** chetrf_rook's recursive compute kernel */ | |||
| static void RELAPACK_chetrf_rook_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_CHETRF, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = CREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| float *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| float *const A_BL_B = A + 2 * *n; | |||
| float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| float *const Work_BL = Work + 2 * n1; | |||
| float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
| // last row of A_BL | |||
| float *const A_BL_b = A_BL + 2 * n2_out; | |||
| // last row of Work_BL | |||
| float *const Work_BL_b = Work_BL + 2 * n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = CREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| float *const A_TL_T = A + 2 * *ldA * n_rest; | |||
| float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
| float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
| float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| float *const Work_L = Work; | |||
| float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,661 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static complex c_b1 = {1.f,0.f}; | |||
| static int c__1 = 1; | |||
| /** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method | |||
| * | |||
| * This routine is a minor modification of LAPACK's clahef_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, complex *a, int *lda, int *ipiv, | |||
| complex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| float r__1, r__2; | |||
| complex q__1, q__2, q__3, q__4, q__5; | |||
| /* Builtin functions */ | |||
| double sqrt(double), r_imag(complex *); | |||
| void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static float t, r1; | |||
| static complex d11, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static float alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
| , complex *, int *, complex *, int *, complex *, complex * | |||
| , int *, ftnlen); | |||
| static float sfmin; | |||
| extern /* Subroutine */ int ccopy_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int itemp; | |||
| extern /* Subroutine */ int cswap_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int kstep; | |||
| static float stemp, absakk; | |||
| extern /* Subroutine */ int clacgv_(int *, complex *, int *); | |||
| extern int icamax_(int *, complex *, int *); | |||
| extern double slamch_(char *, ftnlen); | |||
| extern /* Subroutine */ int csscal_(int *, float *, complex *, int | |||
| *); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| sfmin = slamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & | |||
| c__1); | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + kw * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], | |||
| &c__1); | |||
| } | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| i__1 = k - imax; | |||
| ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| i__1 = k - imax; | |||
| clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + (kw - 1) * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| i__1 = itemp + (kw - 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p + p * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = k - 1 - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| i__1 = k - 1 - p; | |||
| clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); | |||
| if (p > 1) { | |||
| i__1 = p - 1; | |||
| ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + | |||
| 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + | |||
| 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = kk - 1 - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kk - 1 - kp; | |||
| clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| t = a[i__1].r; | |||
| if (dabs(t) >= sfmin) { | |||
| r1 = 1.f / t; | |||
| i__1 = k - 1; | |||
| csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| i__3 = ii + k * a_dim1; | |||
| q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| i__1 = k - 1; | |||
| clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| r_cnjg(&q__2, &d21); | |||
| c_div(&q__1, &w[k + kw * w_dim1], &q__2); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1.f / (q__1.r - 1.f); | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| r_cnjg(&q__5, &d21); | |||
| c_div(&q__2, &q__3, &q__5); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1; | |||
| clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = k - 2; | |||
| clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| ++jj; | |||
| if (kstep == 2 && jp1 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + k * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * | |||
| a_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = imax - k; | |||
| clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
| imax + 1 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
| 1) * w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + (k + 1) * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = itemp + (k + 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (k + 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p + p * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = p - k - 1; | |||
| ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * | |||
| a_dim1], lda); | |||
| i__1 = p - k - 1; | |||
| clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); | |||
| if (p < *n) { | |||
| i__1 = *n - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p | |||
| * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| } | |||
| cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = kp - kk - 1; | |||
| ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kp - kk - 1; | |||
| clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| t = a[i__1].r; | |||
| if (dabs(t) >= sfmin) { | |||
| r1 = 1.f / t; | |||
| i__1 = *n - k; | |||
| csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| i__3 = ii + k * a_dim1; | |||
| q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| i__1 = *n - k; | |||
| clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| r_cnjg(&q__2, &d21); | |||
| c_div(&q__1, &w[k + k * w_dim1], &q__2); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1.f / (q__1.r - 1.f); | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| r_cnjg(&q__5, &d21); | |||
| c_div(&q__2, &q__3, &q__5); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = *n - k; | |||
| clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = *n - k - 1; | |||
| clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| --jj; | |||
| if (kstep == 2 && jp1 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_clauum_rec(const char *, const int *, float *, | |||
| const int *, int *); | |||
| /** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's clauum. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html | |||
| * */ | |||
| void RELAPACK_clauum( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CLAUUM", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** clauum's recursive compute kernel */ | |||
| static void RELAPACK_clauum_rec( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { | |||
| // Unblocked | |||
| LAPACK(clauu2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*uplo == 'L') { | |||
| // A_TL = A_TL + A_BL' * A_BL | |||
| BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
| // A_BL = A_BR' * A_BL | |||
| BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TL = A_TL + A_TR * A_TR' | |||
| BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
| // A_TR = A_TR * A_BR' | |||
| BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info); | |||
| } | |||
| @@ -0,0 +1,157 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, | |||
| float *, const int *, float *, const int *, int *); | |||
| /** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cpbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_cpbtrf( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kd < 0) | |||
| *info = -3; | |||
| else if (*ldAb < *kd + 1) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CPBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Constant | |||
| const float ZERO[] = { 0., 0. }; | |||
| // Allocate work space | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
| const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
| float *Work = malloc(mWork * nWork * 2 * sizeof(float)); | |||
| LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
| // Recursive kernel | |||
| RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
| // Free work space | |||
| free(Work); | |||
| } | |||
| /** cpbtrf's recursive compute kernel */ | |||
| static void RELAPACK_cpbtrf_rec( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| float *Work, const int *ldWork, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); | |||
| // Splitting | |||
| const int n1 = MIN(CREC_SPLIT(*n), *kd); | |||
| const int n2 = *n - n1; | |||
| // * * | |||
| // * Ab_BR | |||
| float *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, *kd - n1); | |||
| const int n22 = MIN(n2 - n21, *kd); | |||
| // n1 n21 n22 | |||
| // n1 * A_TRl A_TRr | |||
| // n21 A_BLt A_BRtl A_BRtr | |||
| // n22 A_BLb A_BRbl A_BRbr | |||
| float *const A_TRl = A_TR; | |||
| float *const A_TRr = A_TR + 2 * *ldA * n21; | |||
| float *const A_BLt = A_BL; | |||
| float *const A_BLb = A_BL + 2 * n21; | |||
| float *const A_BRtl = A_BR; | |||
| float *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
| float *const A_BRbl = A_BR + 2 * n21; | |||
| float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; | |||
| if (*uplo == 'L') { | |||
| // A_BLt = ABLt / A_TL' | |||
| BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_BLt' | |||
| BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); | |||
| // Work = A_BLb | |||
| LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); | |||
| // Work = Work / A_TL' | |||
| BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); | |||
| // A_BRbl = A_BRbl - Work * A_BLt' | |||
| BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRbr = A_BRbr - Work * Work' | |||
| BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
| // A_BLb = Work | |||
| LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); | |||
| } else { | |||
| // A_TRl = A_TL' \ A_TRl | |||
| BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // A_BRtl = A_BRtl - A_TRl' * A_TRl | |||
| BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // Work = A_TRr | |||
| LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); | |||
| // Work = A_TL' \ Work | |||
| BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); | |||
| // A_BRtr = A_BRtr - A_TRl' * Work | |||
| BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Work' * Work | |||
| BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
| // A_TRr = Work | |||
| LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| if (*kd > n1) | |||
| RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info); | |||
| else | |||
| RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,92 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cpotrf_rec(const char *, const int *, float *, | |||
| const int *, int *); | |||
| /** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cpotrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html | |||
| * */ | |||
| void RELAPACK_cpotrf( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CPOTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** cpotrf's recursive compute kernel */ | |||
| static void RELAPACK_cpotrf_rec( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cpotf2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / A_TL' | |||
| BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * A_BL' | |||
| BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
| } else { | |||
| // A_TR = A_TL' \ A_TR | |||
| BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * A_TR | |||
| BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,238 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's csytrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html | |||
| * */ | |||
| void RELAPACK_csytrf( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy arguments | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** csytrf's recursive compute kernel */ | |||
| static void RELAPACK_csytrf_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterator | |||
| int i; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = CREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| float *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| float *const A_BL_B = A + 2 * *n; | |||
| float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| float *const Work_BL = Work + 2 * n1; | |||
| float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
| // last row of A_BL | |||
| float *const A_BL_b = A_BL + 2 * n2_out; | |||
| // last row of Work_BL | |||
| float *const Work_BL_b = Work_BL + 2 * n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = CREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| float *const A_TL_T = A + 2 * *ldA * n_rest; | |||
| float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
| float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
| float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| float *const Work_L = Work; | |||
| float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,451 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static complex c_b1 = {1.f,0.f}; | |||
| static int c__1 = 1; | |||
| /** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method. | |||
| * | |||
| * This routine is a minor modification of LAPACK's clasyf. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int * | |||
| nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, | |||
| int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| float r__1, r__2, r__3, r__4; | |||
| complex q__1, q__2, q__3; | |||
| /* Builtin functions */ | |||
| double sqrt(double), r_imag(complex *); | |||
| void c_div(complex *, complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k; | |||
| static complex t, r1, d11, d21, d22; | |||
| static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
| static float alpha; | |||
| extern /* Subroutine */ int cscal_(int *, complex *, complex *, | |||
| int *); | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
| , complex *, int *, complex *, int *, complex *, complex * | |||
| , int *, ftnlen), ccopy_(int *, complex *, int *, | |||
| complex *, int *), cswap_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int kstep; | |||
| static float absakk; | |||
| extern int icamax_(int *, complex *, int *); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| kstep = 1; | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw * | |||
| w_dim1]), dabs(r__2)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + kw * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| i__1 = k - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
| &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| /* Computing MAX */ | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( | |||
| r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( | |||
| r__2)); | |||
| rowmax = dmax(r__3,r__4); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha * | |||
| rowmax) { | |||
| kp = imax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = kk - 1 - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = q__1.r, r1.i = q__1.i; | |||
| i__1 = k - 1; | |||
| cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + kw * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
| c_div(&q__1, &c_b1, &q__2); | |||
| t.r = q__1.r, t.i = q__1.i; | |||
| c_div(&q__1, &t, &d21); | |||
| d21.r = q__1.r, d21.i = q__1.i; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
| .i; | |||
| q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
| d21.r * q__2.i + d21.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
| .i; | |||
| q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
| d21.r * q__2.i + d21.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| ++j; | |||
| } | |||
| ++j; | |||
| if (jp != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k | |||
| + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
| kstep = 1; | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * | |||
| w_dim1]), dabs(r__2)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + k * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| i__1 = imax - k; | |||
| ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], | |||
| lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * | |||
| w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
| ; | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| /* Computing MAX */ | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( | |||
| r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( | |||
| r__2)); | |||
| rowmax = dmax(r__3,r__4); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha * | |||
| rowmax) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + | |||
| k * w_dim1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = kp - kk - 1; | |||
| ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = q__1.r, r1.i = q__1.i; | |||
| i__1 = *n - k; | |||
| cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k + k * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
| c_div(&q__1, &c_b1, &q__2); | |||
| t.r = q__1.r, t.i = q__1.i; | |||
| c_div(&q__1, &t, &d21); | |||
| d21.r = q__1.r, d21.i = q__1.i; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
| .i; | |||
| q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
| d21.r * q__2.i + d21.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] | |||
| .i; | |||
| q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = | |||
| d21.r * q__2.i + d21.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| --j; | |||
| } | |||
| --j; | |||
| if (jp != jj && j >= 1) { | |||
| cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's csytrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_csytrf_rook( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** csytrf_rook's recursive compute kernel */ | |||
| static void RELAPACK_csytrf_rook_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = CREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| float *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| float *const A_BL_B = A + 2 * *n; | |||
| float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| float *const Work_BL = Work + 2 * n1; | |||
| float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
| // last row of A_BL | |||
| float *const A_BL_b = A_BL + 2 * n2_out; | |||
| // last row of Work_BL | |||
| float *const Work_BL_b = Work_BL + 2 * n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = CREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| float *const A_TL_T = A + 2 * *ldA * n_rest; | |||
| float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
| float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
| float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| float *const Work_L = Work; | |||
| float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,565 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static complex c_b1 = {1.f,0.f}; | |||
| static int c__1 = 1; | |||
| /** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is a minor modification of LAPACK's clasyf_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, complex *a, int *lda, int *ipiv, | |||
| complex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| float r__1, r__2; | |||
| complex q__1, q__2, q__3, q__4; | |||
| /* Builtin functions */ | |||
| double sqrt(double), r_imag(complex *); | |||
| void c_div(complex *, complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static complex t, r1, d11, d12, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static float alpha; | |||
| extern /* Subroutine */ int cscal_(int *, complex *, complex *, | |||
| int *); | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
| , complex *, int *, complex *, int *, complex *, complex * | |||
| , int *, ftnlen); | |||
| static float sfmin; | |||
| extern /* Subroutine */ int ccopy_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int itemp; | |||
| extern /* Subroutine */ int cswap_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int kstep; | |||
| static float stemp, absakk; | |||
| extern int icamax_(int *, complex *, int *); | |||
| extern double slamch_(char *, ftnlen); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| sfmin = slamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw * | |||
| w_dim1]), dabs(r__2)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + kw * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| i__1 = itemp + (kw - 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha * | |||
| rowmax)) { | |||
| kp = imax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = k - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - k + 1; | |||
| cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + k * a_dim1; | |||
| i__2 = kk + k * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = k - 1 - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k | |||
| + k * a_dim1]), dabs(r__2)) >= sfmin) { | |||
| c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = q__1.r, r1.i = q__1.i; | |||
| i__1 = k - 1; | |||
| cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = k + k * a_dim1; | |||
| if (a[i__1].r != 0.f || a[i__1].i != 0.f) { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * | |||
| a_dim1]); | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d12.r = w[i__1].r, d12.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + kw * w_dim1], &d12); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
| c_div(&q__1, &c_b1, &q__2); | |||
| t.r = q__1.r, t.i = q__1.i; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d12); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d12); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| jj = j - 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j <= *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * | |||
| w_dim1]), dabs(r__2)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + k * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
| 1) * w_dim1], &c__1, (ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = itemp + (k + 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (k + 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha * | |||
| rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p - k; | |||
| ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - p + 1; | |||
| ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
| c__1); | |||
| cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + k * a_dim1; | |||
| i__2 = kk + k * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = kp - k - 1; | |||
| ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
| * a_dim1], lda); | |||
| i__1 = *n - kp + 1; | |||
| ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
| a_dim1], &c__1); | |||
| cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k | |||
| + k * a_dim1]), dabs(r__2)) >= sfmin) { | |||
| c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = q__1.r, r1.i = q__1.i; | |||
| i__1 = *n - k; | |||
| cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = k + k * a_dim1; | |||
| if (a[i__1].r != 0.f || a[i__1].i != 0.f) { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * | |||
| a_dim1]); | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k + k * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
| c_div(&q__1, &c_b1, &q__2); | |||
| t.r = q__1.r, t.i = q__1.i; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| jj = j + 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j >= 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,268 @@ | |||
| #include "relapack.h" | |||
| #include <math.h> | |||
| static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, | |||
| const int *, const float *, const int *, const float *, const int *, | |||
| float *, const int *, const float *, const int *, const float *, | |||
| const int *, float *, const int *, float *, float *, float *, int *); | |||
| /** CTGSYL solves the generalized Sylvester equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ctgsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html | |||
| * */ | |||
| void RELAPACK_ctgsyl( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dif, | |||
| float *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| // Parse arguments | |||
| const int notran = LAPACK(lsame)(trans, "N"); | |||
| const int tran = LAPACK(lsame)(trans, "C"); | |||
| // Compute work buffer size | |||
| int lwmin = 1; | |||
| if (notran && (*ijob == 1 || *ijob == 2)) | |||
| lwmin = MAX(1, 2 * *m * *n); | |||
| *info = 0; | |||
| // Check arguments | |||
| if (!tran && !notran) | |||
| *info = -1; | |||
| else if (notran && (*ijob < 0 || *ijob > 4)) | |||
| *info = -2; | |||
| else if (*m <= 0) | |||
| *info = -3; | |||
| else if (*n <= 0) | |||
| *info = -4; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -6; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -8; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -10; | |||
| else if (*ldD < MAX(1, *m)) | |||
| *info = -12; | |||
| else if (*ldE < MAX(1, *n)) | |||
| *info = -14; | |||
| else if (*ldF < MAX(1, *m)) | |||
| *info = -16; | |||
| else if (*lWork < lwmin && *lWork != -1) | |||
| *info = -20; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CTGSYL", &minfo); | |||
| return; | |||
| } | |||
| if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = lwmin; | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantrans = notran ? 'N' : 'C'; | |||
| // Constant | |||
| const float ZERO[] = { 0., 0. }; | |||
| int isolve = 1; | |||
| int ifunc = 0; | |||
| if (notran) { | |||
| if (*ijob >= 3) { | |||
| ifunc = *ijob - 2; | |||
| LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC); | |||
| LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF); | |||
| } else if (*ijob >= 1) | |||
| isolve = 2; | |||
| } | |||
| float scale2; | |||
| int iround; | |||
| for (iround = 1; iround <= isolve; iround++) { | |||
| *scale = 1; | |||
| float dscale = 0; | |||
| float dsum = 1; | |||
| RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info); | |||
| if (dscale != 0) { | |||
| if (*ijob == 1 || *ijob == 3) | |||
| *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); | |||
| else | |||
| *dif = sqrt(*m * *n) / (dscale * sqrt(dsum)); | |||
| } | |||
| if (isolve == 2) { | |||
| if (iround == 1) { | |||
| if (notran) | |||
| ifunc = *ijob; | |||
| scale2 = *scale; | |||
| LAPACK(clacpy)("F", m, n, C, ldC, Work, m); | |||
| LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m); | |||
| LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC); | |||
| LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF); | |||
| } else { | |||
| LAPACK(clacpy)("F", m, n, Work, m, C, ldC); | |||
| LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF); | |||
| *scale = scale2; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /** ctgsyl's recursive vompute kernel */ | |||
| static void RELAPACK_ctgsyl_rec( | |||
| const char *trans, const int *ifunc, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dsum, float *dscale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) { | |||
| // Unblocked | |||
| LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| float scale1[] = { 1., 0. }; | |||
| float scale2[] = { 1., 0. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| const int m1 = CREC_SPLIT(*m); | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const float *const A_TL = A; | |||
| const float *const A_TR = A + 2 * *ldA * m1; | |||
| const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
| // C_T | |||
| // C_B | |||
| float *const C_T = C; | |||
| float *const C_B = C + 2 * m1; | |||
| // D_TL D_TR | |||
| // 0 D_BR | |||
| const float *const D_TL = D; | |||
| const float *const D_TR = D + 2 * *ldD * m1; | |||
| const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1; | |||
| // F_T | |||
| // F_B | |||
| float *const F_T = F; | |||
| float *const F_B = F + 2 * m1; | |||
| if (*trans == 'N') { | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // F_T = F_T - D_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
| // C_B = C_B - A_TR^H * C_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // C_B = C_B - D_TR^H * F_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
| } | |||
| } | |||
| } else { | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + 2 * *ldB * n1; | |||
| const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // C_L C_R | |||
| float *const C_L = C; | |||
| float *const C_R = C + 2 * *ldC * n1; | |||
| // E_TL E_TR | |||
| // 0 E_BR | |||
| const float *const E_TL = E; | |||
| const float *const E_TR = E + 2 * *ldE * n1; | |||
| const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1; | |||
| // F_L F_R | |||
| float *const F_L = F; | |||
| float *const F_R = F + 2 * *ldF * n1; | |||
| if (*trans == 'N') { | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); | |||
| // C_R = C_R + F_L * B_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
| // F_R = F_R + F_L * E_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
| // F_L = F_L + C_R * B_TR | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
| // F_L = F_L + F_R * E_TR | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
| } | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,163 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, | |||
| const int *, const int *, const float *, const int *, const float *, | |||
| const int *, float *, const int *, float *, int *); | |||
| /** CTRSYL solves the complex Sylvester matrix equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ctrsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html | |||
| * */ | |||
| void RELAPACK_ctrsyl( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int notransA = LAPACK(lsame)(tranA, "N"); | |||
| const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
| const int notransB = LAPACK(lsame)(tranB, "N"); | |||
| const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
| *info = 0; | |||
| if (!ctransA && !notransA) | |||
| *info = -1; | |||
| else if (!ctransB && !notransB) | |||
| *info = -2; | |||
| else if (*isgn != 1 && *isgn != -1) | |||
| *info = -3; | |||
| else if (*m < 0) | |||
| *info = -4; | |||
| else if (*n < 0) | |||
| *info = -5; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -7; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -9; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -11; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CTRSYL", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantranA = notransA ? 'N' : 'C'; | |||
| const char cleantranB = notransB ? 'N' : 'C'; | |||
| // Recursive kernel | |||
| RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| /** ctrsyl's recursive compute kernel */ | |||
| static void RELAPACK_ctrsyl_rec( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { | |||
| // Unblocked | |||
| RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const float MSGN[] = { -*isgn, 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| float scale1[] = { 1., 0. }; | |||
| float scale2[] = { 1., 0. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| const int m1 = CREC_SPLIT(*m); | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const float *const A_TL = A; | |||
| const float *const A_TR = A + 2 * *ldA * m1; | |||
| const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
| // C_T | |||
| // C_B | |||
| float *const C_T = C; | |||
| float *const C_B = C + 2 * m1; | |||
| if (*tranA == 'N') { | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| } else { | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
| // C_B = C_B - A_TR' * C_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
| } | |||
| } else { | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + 2 * *ldB * n1; | |||
| const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // C_L C_R | |||
| float *const C_L = C; | |||
| float *const C_R = C + 2 * *ldC * n1; | |||
| if (*tranB == 'N') { | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
| // C_R = C_R -/+ C_L * B_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| } else { | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
| // C_L = C_L -/+ C_R * B_TR' | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,392 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "../config.h" | |||
| #include "f2c.h" | |||
| #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { | |||
| extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); | |||
| complex result; | |||
| cdotu_(&result, n, x, incx, y, incy); | |||
| return result; | |||
| } | |||
| #define cdotu_ cdotu_fun | |||
| complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { | |||
| extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); | |||
| complex result; | |||
| cdotc_(&result, n, x, incx, y, incy); | |||
| return result; | |||
| } | |||
| #define cdotc_ cdotc_fun | |||
| #endif | |||
| #if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| complex cladiv_fun(complex *a, complex *b) { | |||
| extern void cladiv_(complex *, complex *, complex *); | |||
| complex result; | |||
| cladiv_(&result, a, b); | |||
| return result; | |||
| } | |||
| #define cladiv_ cladiv_fun | |||
| #endif | |||
| /* Table of constant values */ | |||
| static int c__1 = 1; | |||
| /** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) | |||
| * | |||
| * This routine is an exact copy of LAPACK's ctrsyl. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int | |||
| *isgn, int *m, int *n, complex *a, int *lda, complex *b, | |||
| int *ldb, complex *c__, int *ldc, float *scale, int *info, | |||
| ftnlen trana_len, ftnlen tranb_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, | |||
| i__3, i__4; | |||
| float r__1, r__2; | |||
| complex q__1, q__2, q__3, q__4; | |||
| /* Builtin functions */ | |||
| float r_imag(complex *); | |||
| void r_cnjg(complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k, l; | |||
| static complex a11; | |||
| static float db; | |||
| static complex x11; | |||
| static float da11; | |||
| static complex vec; | |||
| static float dum[1], eps, sgn, smin; | |||
| static complex suml, sumr; | |||
| /* Complex */ complex cdotc_(int *, complex *, int | |||
| *, complex *, int *); | |||
| extern int lsame_(char *, char *, ftnlen, ftnlen); | |||
| /* Complex */ complex cdotu_(int *, complex *, int | |||
| *, complex *, int *); | |||
| extern /* Subroutine */ int slabad_(float *, float *); | |||
| extern float clange_(char *, int *, int *, complex *, | |||
| int *, float *, ftnlen); | |||
| /* Complex */ complex cladiv_(complex *, complex *); | |||
| static float scaloc; | |||
| extern float slamch_(char *, ftnlen); | |||
| extern /* Subroutine */ int csscal_(int *, float *, complex *, int | |||
| *), xerbla_(char *, int *, ftnlen); | |||
| static float bignum; | |||
| static int notrna, notrnb; | |||
| static float smlnum; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| b_dim1 = *ldb; | |||
| b_offset = 1 + b_dim1; | |||
| b -= b_offset; | |||
| c_dim1 = *ldc; | |||
| c_offset = 1 + c_dim1; | |||
| c__ -= c_offset; | |||
| /* Function Body */ | |||
| notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); | |||
| notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); | |||
| *info = 0; | |||
| if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { | |||
| *info = -1; | |||
| } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { | |||
| *info = -2; | |||
| } else if (*isgn != 1 && *isgn != -1) { | |||
| *info = -3; | |||
| } else if (*m < 0) { | |||
| *info = -4; | |||
| } else if (*n < 0) { | |||
| *info = -5; | |||
| } else if (*lda < max(1,*m)) { | |||
| *info = -7; | |||
| } else if (*ldb < max(1,*n)) { | |||
| *info = -9; | |||
| } else if (*ldc < max(1,*m)) { | |||
| *info = -11; | |||
| } | |||
| if (*info != 0) { | |||
| i__1 = -(*info); | |||
| xerbla_("CTRSY2", &i__1, (ftnlen)6); | |||
| return; | |||
| } | |||
| *scale = 1.f; | |||
| if (*m == 0 || *n == 0) { | |||
| return; | |||
| } | |||
| eps = slamch_("P", (ftnlen)1); | |||
| smlnum = slamch_("S", (ftnlen)1); | |||
| bignum = 1.f / smlnum; | |||
| slabad_(&smlnum, &bignum); | |||
| smlnum = smlnum * (float) (*m * *n) / eps; | |||
| bignum = 1.f / smlnum; | |||
| /* Computing MAX */ | |||
| r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, ( | |||
| ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n, | |||
| &b[b_offset], ldb, dum, (ftnlen)1); | |||
| smin = dmax(r__1,r__2); | |||
| sgn = (float) (*isgn); | |||
| if (notrna && notrnb) { | |||
| i__1 = *n; | |||
| for (l = 1; l <= i__1; ++l) { | |||
| for (k = *m; k >= 1; --k) { | |||
| i__2 = *m - k; | |||
| /* Computing MIN */ | |||
| i__3 = k + 1; | |||
| /* Computing MIN */ | |||
| i__4 = k + 1; | |||
| q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ | |||
| min(i__4,*m) + l * c_dim1], &c__1); | |||
| suml.r = q__1.r, suml.i = q__1.i; | |||
| i__2 = l - 1; | |||
| q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] | |||
| , &c__1); | |||
| sumr.r = q__1.r, sumr.i = q__1.i; | |||
| i__2 = k + l * c_dim1; | |||
| q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; | |||
| q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; | |||
| q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; | |||
| vec.r = q__1.r, vec.i = q__1.i; | |||
| scaloc = 1.f; | |||
| i__2 = k + k * a_dim1; | |||
| i__3 = l + l * b_dim1; | |||
| q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i; | |||
| q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i; | |||
| a11.r = q__1.r, a11.i = q__1.i; | |||
| da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), | |||
| dabs(r__2)); | |||
| if (da11 <= smin) { | |||
| a11.r = smin, a11.i = 0.f; | |||
| da11 = smin; | |||
| *info = 1; | |||
| } | |||
| db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( | |||
| r__2)); | |||
| if (da11 < 1.f && db > 1.f) { | |||
| if (db > bignum * da11) { | |||
| scaloc = 1.f / db; | |||
| } | |||
| } | |||
| q__3.r = scaloc, q__3.i = 0.f; | |||
| q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * | |||
| q__3.i + vec.i * q__3.r; | |||
| q__1 = cladiv_(&q__2, &a11); | |||
| x11.r = q__1.r, x11.i = q__1.i; | |||
| if (scaloc != 1.f) { | |||
| i__2 = *n; | |||
| for (j = 1; j <= i__2; ++j) { | |||
| csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
| /* L10: */ | |||
| } | |||
| *scale *= scaloc; | |||
| } | |||
| i__2 = k + l * c_dim1; | |||
| c__[i__2].r = x11.r, c__[i__2].i = x11.i; | |||
| /* L20: */ | |||
| } | |||
| /* L30: */ | |||
| } | |||
| } else if (! notrna && notrnb) { | |||
| i__1 = *n; | |||
| for (l = 1; l <= i__1; ++l) { | |||
| i__2 = *m; | |||
| for (k = 1; k <= i__2; ++k) { | |||
| i__3 = k - 1; | |||
| q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * | |||
| c_dim1 + 1], &c__1); | |||
| suml.r = q__1.r, suml.i = q__1.i; | |||
| i__3 = l - 1; | |||
| q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] | |||
| , &c__1); | |||
| sumr.r = q__1.r, sumr.i = q__1.i; | |||
| i__3 = k + l * c_dim1; | |||
| q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; | |||
| q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; | |||
| q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i; | |||
| vec.r = q__1.r, vec.i = q__1.i; | |||
| scaloc = 1.f; | |||
| r_cnjg(&q__2, &a[k + k * a_dim1]); | |||
| i__3 = l + l * b_dim1; | |||
| q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; | |||
| q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; | |||
| a11.r = q__1.r, a11.i = q__1.i; | |||
| da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), | |||
| dabs(r__2)); | |||
| if (da11 <= smin) { | |||
| a11.r = smin, a11.i = 0.f; | |||
| da11 = smin; | |||
| *info = 1; | |||
| } | |||
| db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( | |||
| r__2)); | |||
| if (da11 < 1.f && db > 1.f) { | |||
| if (db > bignum * da11) { | |||
| scaloc = 1.f / db; | |||
| } | |||
| } | |||
| q__3.r = scaloc, q__3.i = 0.f; | |||
| q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * | |||
| q__3.i + vec.i * q__3.r; | |||
| q__1 = cladiv_(&q__2, &a11); | |||
| x11.r = q__1.r, x11.i = q__1.i; | |||
| if (scaloc != 1.f) { | |||
| i__3 = *n; | |||
| for (j = 1; j <= i__3; ++j) { | |||
| csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
| /* L40: */ | |||
| } | |||
| *scale *= scaloc; | |||
| } | |||
| i__3 = k + l * c_dim1; | |||
| c__[i__3].r = x11.r, c__[i__3].i = x11.i; | |||
| /* L50: */ | |||
| } | |||
| /* L60: */ | |||
| } | |||
| } else if (! notrna && ! notrnb) { | |||
| for (l = *n; l >= 1; --l) { | |||
| i__1 = *m; | |||
| for (k = 1; k <= i__1; ++k) { | |||
| i__2 = k - 1; | |||
| q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * | |||
| c_dim1 + 1], &c__1); | |||
| suml.r = q__1.r, suml.i = q__1.i; | |||
| i__2 = *n - l; | |||
| /* Computing MIN */ | |||
| i__3 = l + 1; | |||
| /* Computing MIN */ | |||
| i__4 = l + 1; | |||
| q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ | |||
| l + min(i__4,*n) * b_dim1], ldb); | |||
| sumr.r = q__1.r, sumr.i = q__1.i; | |||
| i__2 = k + l * c_dim1; | |||
| r_cnjg(&q__4, &sumr); | |||
| q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; | |||
| q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; | |||
| q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; | |||
| vec.r = q__1.r, vec.i = q__1.i; | |||
| scaloc = 1.f; | |||
| i__2 = k + k * a_dim1; | |||
| i__3 = l + l * b_dim1; | |||
| q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; | |||
| q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i; | |||
| r_cnjg(&q__1, &q__2); | |||
| a11.r = q__1.r, a11.i = q__1.i; | |||
| da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), | |||
| dabs(r__2)); | |||
| if (da11 <= smin) { | |||
| a11.r = smin, a11.i = 0.f; | |||
| da11 = smin; | |||
| *info = 1; | |||
| } | |||
| db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( | |||
| r__2)); | |||
| if (da11 < 1.f && db > 1.f) { | |||
| if (db > bignum * da11) { | |||
| scaloc = 1.f / db; | |||
| } | |||
| } | |||
| q__3.r = scaloc, q__3.i = 0.f; | |||
| q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * | |||
| q__3.i + vec.i * q__3.r; | |||
| q__1 = cladiv_(&q__2, &a11); | |||
| x11.r = q__1.r, x11.i = q__1.i; | |||
| if (scaloc != 1.f) { | |||
| i__2 = *n; | |||
| for (j = 1; j <= i__2; ++j) { | |||
| csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
| /* L70: */ | |||
| } | |||
| *scale *= scaloc; | |||
| } | |||
| i__2 = k + l * c_dim1; | |||
| c__[i__2].r = x11.r, c__[i__2].i = x11.i; | |||
| /* L80: */ | |||
| } | |||
| /* L90: */ | |||
| } | |||
| } else if (notrna && ! notrnb) { | |||
| for (l = *n; l >= 1; --l) { | |||
| for (k = *m; k >= 1; --k) { | |||
| i__1 = *m - k; | |||
| /* Computing MIN */ | |||
| i__2 = k + 1; | |||
| /* Computing MIN */ | |||
| i__3 = k + 1; | |||
| q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[ | |||
| min(i__3,*m) + l * c_dim1], &c__1); | |||
| suml.r = q__1.r, suml.i = q__1.i; | |||
| i__1 = *n - l; | |||
| /* Computing MIN */ | |||
| i__2 = l + 1; | |||
| /* Computing MIN */ | |||
| i__3 = l + 1; | |||
| q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[ | |||
| l + min(i__3,*n) * b_dim1], ldb); | |||
| sumr.r = q__1.r, sumr.i = q__1.i; | |||
| i__1 = k + l * c_dim1; | |||
| r_cnjg(&q__4, &sumr); | |||
| q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; | |||
| q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; | |||
| q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i; | |||
| vec.r = q__1.r, vec.i = q__1.i; | |||
| scaloc = 1.f; | |||
| i__1 = k + k * a_dim1; | |||
| r_cnjg(&q__3, &b[l + l * b_dim1]); | |||
| q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i; | |||
| q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i; | |||
| a11.r = q__1.r, a11.i = q__1.i; | |||
| da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), | |||
| dabs(r__2)); | |||
| if (da11 <= smin) { | |||
| a11.r = smin, a11.i = 0.f; | |||
| da11 = smin; | |||
| *info = 1; | |||
| } | |||
| db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( | |||
| r__2)); | |||
| if (da11 < 1.f && db > 1.f) { | |||
| if (db > bignum * da11) { | |||
| scaloc = 1.f / db; | |||
| } | |||
| } | |||
| q__3.r = scaloc, q__3.i = 0.f; | |||
| q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * | |||
| q__3.i + vec.i * q__3.r; | |||
| q__1 = cladiv_(&q__2, &a11); | |||
| x11.r = q__1.r, x11.i = q__1.i; | |||
| if (scaloc != 1.f) { | |||
| i__1 = *n; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
| /* L100: */ | |||
| } | |||
| *scale *= scaloc; | |||
| } | |||
| i__1 = k + l * c_dim1; | |||
| c__[i__1].r = x11.r, c__[i__1].i = x11.i; | |||
| /* L110: */ | |||
| } | |||
| /* L120: */ | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,107 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, | |||
| float *, const int *, int *); | |||
| /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ctrtri. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html | |||
| * */ | |||
| void RELAPACK_ctrtri( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int nounit = LAPACK(lsame)(diag, "N"); | |||
| const int unit = LAPACK(lsame)(diag, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (!nounit && !unit) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CTRTRI", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleandiag = nounit ? 'N' : 'U'; | |||
| // check for singularity | |||
| if (nounit) { | |||
| int i; | |||
| for (i = 0; i < *n; i++) | |||
| if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { | |||
| *info = i; | |||
| return; | |||
| } | |||
| } | |||
| // Recursive kernel | |||
| RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
| } | |||
| /** ctrtri's recursive compute kernel */ | |||
| static void RELAPACK_ctrtri_rec( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { | |||
| // Unblocked | |||
| LAPACK(ctrti2)(uplo, diag, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = - A_BL * A_TL | |||
| BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BL = A_BR \ A_BL | |||
| BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TR = - A_TL * A_TR | |||
| BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
| // A_TR = A_TR / A_BR | |||
| BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,227 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, | |||
| const int *, double *, const int *, int *, double *, const int *, double *, | |||
| const int *, int *); | |||
| /** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dgbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_dgbtrf( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| double *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kl < 0) | |||
| *info = -3; | |||
| else if (*ku < 0) | |||
| *info = -4; | |||
| else if (*ldAb < 2 * *kl + *ku + 1) | |||
| *info = -6; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DGBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Constant | |||
| const double ZERO[] = { 0. }; | |||
| // Result upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| double *const A = Ab + kv; | |||
| // Zero upper diagonal fill-in elements | |||
| int i, j; | |||
| for (j = 0; j < *n; j++) { | |||
| double *const A_j = A + *ldA * j; | |||
| for (i = MAX(0, j - kv); i < j - *ku; i++) | |||
| A_j[i] = 0.; | |||
| } | |||
| // Allocate work space | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
| const int nWorkl = (kv > n1) ? n1 : kv; | |||
| const int mWorku = (*kl > n1) ? n1 : *kl; | |||
| const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
| double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); | |||
| double *Worku = malloc(mWorku * nWorku * sizeof(double)); | |||
| LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
| LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
| // Recursive kernel | |||
| RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
| // Free work space | |||
| free(Workl); | |||
| free(Worku); | |||
| } | |||
| /** dgbtrf's recursive compute kernel */ | |||
| static void RELAPACK_dgbtrf_rec( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| double *Ab, const int *ldAb, int *ipiv, | |||
| double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterators | |||
| int i, j; | |||
| // Output upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| double *const A = Ab + kv; | |||
| // Splitting | |||
| const int n1 = MIN(DREC_SPLIT(*n), *kl); | |||
| const int n2 = *n - n1; | |||
| const int m1 = MIN(n1, *m); | |||
| const int m2 = *m - m1; | |||
| const int mn1 = MIN(m1, n1); | |||
| const int mn2 = MIN(m2, n2); | |||
| // Ab_L * | |||
| // Ab_BR | |||
| double *const Ab_L = Ab; | |||
| double *const Ab_BR = Ab + *ldAb * n1; | |||
| // A_L A_R | |||
| double *const A_L = A; | |||
| double *const A_R = A + *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + m1; | |||
| double *const A_BR = A + *ldA * n1 + m1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, kv - n1); | |||
| const int n22 = MIN(n2 - n21, n1); | |||
| const int m21 = MIN(m2, *kl - m1); | |||
| const int m22 = MIN(m2 - m21, m1); | |||
| // n1 n21 n22 | |||
| // m * A_Rl ARr | |||
| double *const A_Rl = A_R; | |||
| double *const A_Rr = A_R + *ldA * n21; | |||
| // n1 n21 n22 | |||
| // m1 * A_TRl A_TRr | |||
| // m21 A_BLt A_BRtl A_BRtr | |||
| // m22 A_BLb A_BRbl A_BRbr | |||
| double *const A_TRl = A_TR; | |||
| double *const A_TRr = A_TR + *ldA * n21; | |||
| double *const A_BLt = A_BL; | |||
| double *const A_BLb = A_BL + m21; | |||
| double *const A_BRtl = A_BR; | |||
| double *const A_BRtr = A_BR + *ldA * n21; | |||
| double *const A_BRbl = A_BR + m21; | |||
| double *const A_BRbr = A_BR + *ldA * n21 + m21; | |||
| // recursion(Ab_L, ipiv_T) | |||
| RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
| // Workl = A_BLb | |||
| LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
| // partially redo swaps in A_L | |||
| for (i = 0; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); | |||
| else | |||
| BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); | |||
| } | |||
| } | |||
| // apply pivots to A_Rl | |||
| LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
| // apply pivots to A_Rr columnwise | |||
| for (j = 0; j < n22; j++) { | |||
| double *const A_Rrj = A_Rr + *ldA * j; | |||
| for (i = j; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| const double tmp = A_Rrj[i]; | |||
| A_Rrj[i] = A_Rr[ip]; | |||
| A_Rrj[ip] = tmp; | |||
| } | |||
| } | |||
| } | |||
| // A_TRl = A_TL \ A_TRl | |||
| BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // Worku = A_TRr | |||
| LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
| // Worku = A_TL \ Worku | |||
| BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
| // A_TRr = Worku | |||
| LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_TRl | |||
| BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // A_BRbl = A_BRbl - Workl * A_TRl | |||
| BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRtr = A_BRtr - A_BLt * Worku | |||
| BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Workl * Worku | |||
| BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
| // partially undo swaps in A_L | |||
| for (i = mn1 - 1; i >= 0; i--) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); | |||
| else | |||
| BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); | |||
| } | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| for (i = 0; i < mn2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,165 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, | |||
| const int *, const int *, const double *, const double *, const int *, | |||
| const double *, const int *, const double *, double *, const int *); | |||
| static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, | |||
| const int *, const int *, const double *, const double *, const int *, | |||
| const double *, const int *, const double *, double *, const int *); | |||
| /** DGEMMT computes a matrix-matrix product with general matrices but updates | |||
| * only the upper or lower triangular part of the result matrix. | |||
| * | |||
| * This routine performs the same operation as the BLAS routine | |||
| * dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
| * but only updates the triangular part of C specified by uplo: | |||
| * If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
| * otherwise the upper triangular part is updated. | |||
| * */ | |||
| void RELAPACK_dgemmt( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| #if HAVE_XGEMMT | |||
| BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| #else | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int notransA = LAPACK(lsame)(transA, "N"); | |||
| const int tranA = LAPACK(lsame)(transA, "T"); | |||
| const int notransB = LAPACK(lsame)(transB, "N"); | |||
| const int tranB = LAPACK(lsame)(transB, "T"); | |||
| int info = 0; | |||
| if (!lower && !upper) | |||
| info = 1; | |||
| else if (!tranA && !notransA) | |||
| info = 2; | |||
| else if (!tranB && !notransB) | |||
| info = 3; | |||
| else if (*n < 0) | |||
| info = 4; | |||
| else if (*k < 0) | |||
| info = 5; | |||
| else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
| info = 8; | |||
| else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
| info = 10; | |||
| else if (*ldC < MAX(1, *n)) | |||
| info = 13; | |||
| if (info) { | |||
| LAPACK(xerbla)("DGEMMT", &info); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleantransA = notransA ? 'N' : 'T'; | |||
| const char cleantransB = notransB ? 'N' : 'T'; | |||
| // Recursive kernel | |||
| RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| #endif | |||
| } | |||
| /** dgemmt's recursive compute kernel */ | |||
| static void RELAPACK_dgemmt_rec( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { | |||
| // Unblocked | |||
| RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| } | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_T | |||
| // A_B | |||
| const double *const A_T = A; | |||
| const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); | |||
| // B_L B_R | |||
| const double *const B_L = B; | |||
| const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); | |||
| // C_TL C_TR | |||
| // C_BL C_BR | |||
| double *const C_TL = C; | |||
| double *const C_TR = C + *ldC * n1; | |||
| double *const C_BL = C + n1; | |||
| double *const C_BR = C + *ldC * n1 + n1; | |||
| // recursion(C_TL) | |||
| RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
| if (*uplo == 'L') | |||
| // C_BL = alpha A_B B_L + beta C_BL | |||
| BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
| else | |||
| // C_TR = alpha A_T B_R + beta C_TR | |||
| BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
| // recursion(C_BR) | |||
| RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
| } | |||
| /** dgemmt's unblocked compute kernel */ | |||
| static void RELAPACK_dgemmt_rec2( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| const int incB = (*transB == 'N') ? 1 : *ldB; | |||
| const int incC = 1; | |||
| int i; | |||
| for (i = 0; i < *n; i++) { | |||
| // A_0 | |||
| // A_i | |||
| const double *const A_0 = A; | |||
| const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i); | |||
| // * B_i * | |||
| const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i); | |||
| // * C_0i * | |||
| // * C_ii * | |||
| double *const C_0i = C + *ldC * i; | |||
| double *const C_ii = C + *ldC * i + i; | |||
| if (*uplo == 'L') { | |||
| const int nmi = *n - i; | |||
| if (*transA == 'N') | |||
| BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| else | |||
| BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| } else { | |||
| const int ip1 = i + 1; | |||
| if (*transA == 'N') | |||
| BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| else | |||
| BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dgetrf_rec(const int *, const int *, double *, | |||
| const int *, int *, int *); | |||
| /** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dgetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html | |||
| * */ | |||
| void RELAPACK_dgetrf( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DGETRF", &minfo); | |||
| return; | |||
| } | |||
| const int sn = MIN(*m, *n); | |||
| RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| if (*m < *n) { | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const int iONE[] = { 1. }; | |||
| // Splitting | |||
| const int rn = *n - *m; | |||
| // A_L A_R | |||
| const double *const A_L = A; | |||
| double *const A_R = A + *ldA * *m; | |||
| // A_R = apply(ipiv, A_R) | |||
| LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
| // A_R = A_S \ A_R | |||
| BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
| } | |||
| } | |||
| /** dgetrf's recursive compute kernel */ | |||
| static void RELAPACK_dgetrf_rec( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(dgetf2)(m, n, A, ldA, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| const int m2 = *m - n1; | |||
| // A_L A_R | |||
| double *const A_L = A; | |||
| double *const A_R = A + *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // recursion(A_L, ipiv_T) | |||
| RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
| // apply pivots to A_R | |||
| LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
| // A_TR = A_TL \ A_TR | |||
| BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_BL * A_TR | |||
| BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
| // recursion(A_BR, ipiv_B) | |||
| RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // apply pivots to A_BL | |||
| LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dlauum_rec(const char *, const int *, double *, | |||
| const int *, int *); | |||
| /** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dlauum. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html | |||
| * */ | |||
| void RELAPACK_dlauum( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DLAUUM", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** dlauum's recursive compute kernel */ | |||
| static void RELAPACK_dlauum_rec( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { | |||
| // Unblocked | |||
| LAPACK(dlauu2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*uplo == 'L') { | |||
| // A_TL = A_TL + A_BL' * A_BL | |||
| BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
| // A_BL = A_BR' * A_BL | |||
| BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TL = A_TL + A_TR * A_TR' | |||
| BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
| // A_TR = A_TR * A_BR' | |||
| BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info); | |||
| } | |||
| @@ -0,0 +1,157 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, | |||
| double *, const int *, double *, const int *, int *); | |||
| /** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dpbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_dpbtrf( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kd < 0) | |||
| *info = -3; | |||
| else if (*ldAb < *kd + 1) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DPBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Constant | |||
| const double ZERO[] = { 0. }; | |||
| // Allocate work space | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
| const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
| double *Work = malloc(mWork * nWork * sizeof(double)); | |||
| LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
| // Recursive kernel | |||
| RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
| // Free work space | |||
| free(Work); | |||
| } | |||
| /** dpbtrf's recursive compute kernel */ | |||
| static void RELAPACK_dpbtrf_rec( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| double *Work, const int *ldWork, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| double *const A = Ab + ((*uplo == 'L') ? 0 : *kd); | |||
| // Splitting | |||
| const int n1 = MIN(DREC_SPLIT(*n), *kd); | |||
| const int n2 = *n - n1; | |||
| // * * | |||
| // * Ab_BR | |||
| double *const Ab_BR = Ab + *ldAb * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, *kd - n1); | |||
| const int n22 = MIN(n2 - n21, n1); | |||
| // n1 n21 n22 | |||
| // n1 * A_TRl A_TRr | |||
| // n21 A_BLt A_BRtl A_BRtr | |||
| // n22 A_BLb A_BRbl A_BRbr | |||
| double *const A_TRl = A_TR; | |||
| double *const A_TRr = A_TR + *ldA * n21; | |||
| double *const A_BLt = A_BL; | |||
| double *const A_BLb = A_BL + n21; | |||
| double *const A_BRtl = A_BR; | |||
| double *const A_BRtr = A_BR + *ldA * n21; | |||
| double *const A_BRbl = A_BR + n21; | |||
| double *const A_BRbr = A_BR + *ldA * n21 + n21; | |||
| if (*uplo == 'L') { | |||
| // A_BLt = ABLt / A_TL' | |||
| BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_BLt' | |||
| BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); | |||
| // Work = A_BLb | |||
| LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); | |||
| // Work = Work / A_TL' | |||
| BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); | |||
| // A_BRbl = A_BRbl - Work * A_BLt' | |||
| BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRbr = A_BRbr - Work * Work' | |||
| BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
| // A_BLb = Work | |||
| LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); | |||
| } else { | |||
| // A_TRl = A_TL' \ A_TRl | |||
| BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // A_BRtl = A_BRtl - A_TRl' * A_TRl | |||
| BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // Work = A_TRr | |||
| LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); | |||
| // Work = A_TL' \ Work | |||
| BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); | |||
| // A_BRtr = A_BRtr - A_TRl' * Work | |||
| BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Work' * Work | |||
| BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
| // A_TRr = Work | |||
| LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| if (*kd > n1) | |||
| RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info); | |||
| else | |||
| RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,92 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dpotrf_rec(const char *, const int *, double *, | |||
| const int *, int *); | |||
| /** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dpotrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html | |||
| * */ | |||
| void RELAPACK_dpotrf( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DPOTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** dpotrf's recursive compute kernel */ | |||
| static void RELAPACK_dpotrf_rec( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(dpotf2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / A_TL' | |||
| BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * A_BL' | |||
| BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
| } else { | |||
| // A_TR = A_TL' \ A_TR | |||
| BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * A_TR | |||
| BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,212 @@ | |||
| #include "relapack.h" | |||
| #if XSYGST_ALLOW_MALLOC | |||
| #include "stdlib.h" | |||
| #endif | |||
| static void RELAPACK_dsygst_rec(const int *, const char *, const int *, | |||
| double *, const int *, const double *, const int *, | |||
| double *, const int *, int *); | |||
| /** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dsygst. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html | |||
| * */ | |||
| void RELAPACK_dsygst( | |||
| const int *itype, const char *uplo, const int *n, | |||
| double *A, const int *ldA, const double *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (*itype < 1 || *itype > 3) | |||
| *info = -1; | |||
| else if (!lower && !upper) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -7; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DSYGST", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Allocate work space | |||
| double *Work = NULL; | |||
| int lWork = 0; | |||
| #if XSYGST_ALLOW_MALLOC | |||
| const int n1 = DREC_SPLIT(*n); | |||
| lWork = n1 * (*n - n1); | |||
| Work = malloc(lWork * sizeof(double)); | |||
| if (!Work) | |||
| lWork = 0; | |||
| #endif | |||
| // recursive kernel | |||
| RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
| // Free work space | |||
| #if XSYGST_ALLOW_MALLOC | |||
| if (Work) | |||
| free(Work); | |||
| #endif | |||
| } | |||
| /** dsygst's recursive compute kernel */ | |||
| static void RELAPACK_dsygst_rec( | |||
| const int *itype, const char *uplo, const int *n, | |||
| double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SSYGST, 1)) { | |||
| // Unblocked | |||
| LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ZERO[] = { 0. }; | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const double HALF[] = { .5 }; | |||
| const double MHALF[] = { -.5 }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterator | |||
| int i; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // B_TL B_TR | |||
| // B_BL B_BR | |||
| const double *const B_TL = B; | |||
| const double *const B_TR = B + *ldB * n1; | |||
| const double *const B_BL = B + n1; | |||
| const double *const B_BR = B + *ldB * n1 + n1; | |||
| // recursion(A_TL, B_TL) | |||
| RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); | |||
| if (*itype == 1) | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / B_TL' | |||
| BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = -1/2 * B_BL * A_TL | |||
| BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
| } else | |||
| // A_BL = A_BL - 1/2 B_BL * A_TL | |||
| BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' | |||
| BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
| else | |||
| // A_BL = A_BL - 1/2 B_BL * A_TL | |||
| BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BL = B_BR \ A_BL | |||
| BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
| } else { | |||
| // A_TR = B_TL' \ A_TR | |||
| BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = -1/2 * A_TL * B_TR | |||
| BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
| // A_TR = A_BL + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
| } else | |||
| // A_TR = A_TR - 1/2 A_TL * B_TR | |||
| BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR | |||
| BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_TR = A_BL + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
| else | |||
| // A_TR = A_TR - 1/2 A_TL * B_TR | |||
| BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TR = A_TR / B_BR | |||
| BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
| } | |||
| else | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL * B_TL | |||
| BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = 1/2 * A_BR * B_BL | |||
| BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
| } else | |||
| // A_BL = A_BL + 1/2 A_BR * B_BL | |||
| BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL | |||
| BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
| else | |||
| // A_BL = A_BL + 1/2 A_BR * B_BL | |||
| BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BL = B_BR * A_BL | |||
| BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
| } else { | |||
| // A_TR = B_TL * A_TR | |||
| BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = 1/2 * B_TR * A_BR | |||
| BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
| // A_TR = A_TR + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
| } else | |||
| // A_TR = A_TR + 1/2 B_TR A_BR | |||
| BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' | |||
| BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_TR = A_TR + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
| else | |||
| // A_TR = A_TR + 1/2 B_TR * A_BR | |||
| BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TR = A_TR * B_BR | |||
| BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR, B_BR) | |||
| RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); | |||
| } | |||
| @@ -0,0 +1,238 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *, | |||
| double *, const int *, int *, double *, const int *, int *); | |||
| /** DSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dsytrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html | |||
| * */ | |||
| void RELAPACK_dsytrf( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| double *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * sizeof(double)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy arguments | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** dsytrf's recursive compute kernel */ | |||
| static void RELAPACK_dsytrf_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_DSYTRF, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterator | |||
| int i; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = DREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| double *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| double *const A_BL_B = A + *n; | |||
| double *const A_BR_B = A + *ldA * n1 + *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| double *const Work_BL = Work + n1; | |||
| double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; | |||
| // last row of A_BL | |||
| double *const A_BL_b = A_BL + n2_out; | |||
| // last row of Work_BL | |||
| double *const Work_BL_b = Work_BL + n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = DREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| double *const Work_R = top ? Work : Work + *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| double *const A_TL_T = A + *ldA * n_rest; | |||
| double *const A_TR_T = A + *ldA * (n_rest + n1); | |||
| double *const A_TL = A + *ldA * n_rest + n_rest; | |||
| double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| double *const Work_L = Work; | |||
| double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,352 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static int c__1 = 1; | |||
| static double c_b8 = -1.; | |||
| static double c_b9 = 1.; | |||
| /** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. | |||
| * | |||
| * This routine is a minor modification of LAPACK's dlasyf. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int * | |||
| nb, int *kb, double *a, int *lda, int *ipiv, | |||
| double *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; | |||
| double d__1, d__2, d__3; | |||
| /* Builtin functions */ | |||
| double sqrt(double); | |||
| /* Local variables */ | |||
| static int j, k; | |||
| static double t, r1, d11, d21, d22; | |||
| static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
| static double alpha; | |||
| extern /* Subroutine */ int dscal_(int *, double *, double *, | |||
| int *); | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int dgemv_(char *, int *, int *, | |||
| double *, double *, int *, double *, int *, | |||
| double *, double *, int *, ftnlen), dcopy_(int *, | |||
| double *, int *, double *, int *), dswap_(int | |||
| *, double *, int *, double *, int *); | |||
| static int kstep; | |||
| static double absakk; | |||
| extern int idamax_(int *, double *, int *); | |||
| static double colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.) + 1.) / 8.; | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| kstep = 1; | |||
| absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| i__1 = k - imax; | |||
| jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
| &c__1); | |||
| rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| /* Computing MAX */ | |||
| d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1], | |||
| abs(d__1)); | |||
| rowmax = max(d__2,d__3); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >= | |||
| alpha * rowmax) { | |||
| kp = imax; | |||
| dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kp != kk) { | |||
| a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; | |||
| i__1 = kk - 1 - kp; | |||
| dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| r1 = 1. / a[k + k * a_dim1]; | |||
| i__1 = k - 1; | |||
| dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (k > 2) { | |||
| d21 = w[k - 1 + kw * w_dim1]; | |||
| d11 = w[k + kw * w_dim1] / d21; | |||
| d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; | |||
| t = 1. / (d11 * d22 - 1.); | |||
| d21 = t / d21; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) | |||
| * w_dim1] - w[j + kw * w_dim1]); | |||
| a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - | |||
| w[j + (kw - 1) * w_dim1]); | |||
| /* L20: */ | |||
| } | |||
| } | |||
| a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; | |||
| a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; | |||
| a[k + k * a_dim1] = w[k + kw * w_dim1]; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| ++j; | |||
| } | |||
| ++j; | |||
| if (jp != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| i__1 = *n - k + 1; | |||
| dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k | |||
| + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
| kstep = 1; | |||
| absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| i__1 = imax - k; | |||
| dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], | |||
| lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * | |||
| w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
| ; | |||
| rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| /* Computing MAX */ | |||
| d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1], | |||
| abs(d__1)); | |||
| rowmax = max(d__2,d__3); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >= | |||
| alpha * rowmax) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kp != kk) { | |||
| a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; | |||
| i__1 = kp - kk - 1; | |||
| dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| r1 = 1. / a[k + k * a_dim1]; | |||
| i__1 = *n - k; | |||
| dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| d21 = w[k + 1 + k * w_dim1]; | |||
| d11 = w[k + 1 + (k + 1) * w_dim1] / d21; | |||
| d22 = w[k + k * w_dim1] / d21; | |||
| t = 1. / (d11 * d22 - 1.); | |||
| d21 = t / d21; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - | |||
| w[j + (k + 1) * w_dim1]); | |||
| a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * | |||
| w_dim1] - w[j + k * w_dim1]); | |||
| /* L80: */ | |||
| } | |||
| } | |||
| a[k + k * a_dim1] = w[k + k * w_dim1]; | |||
| a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; | |||
| a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| --j; | |||
| } | |||
| --j; | |||
| if (jp != jj && j >= 1) { | |||
| dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *, | |||
| double *, const int *, int *, double *, const int *, int *); | |||
| /** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dsytrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_dsytrf_rook( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| double *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * sizeof(double)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** dsytrf_rook's recursive compute kernel */ | |||
| static void RELAPACK_dsytrf_rook_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = DREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| double *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| double *const A_BL_B = A + *n; | |||
| double *const A_BR_B = A + *ldA * n1 + *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| double *const Work_BL = Work + n1; | |||
| double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; | |||
| // last row of A_BL | |||
| double *const A_BL_b = A_BL + n2_out; | |||
| // last row of Work_BL | |||
| double *const Work_BL_b = Work_BL + n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = DREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| double *const Work_R = top ? Work : Work + *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| double *const A_TL_T = A + *ldA * n_rest; | |||
| double *const A_TR_T = A + *ldA * (n_rest + n1); | |||
| double *const A_TL = A + *ldA * n_rest + n_rest; | |||
| double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| double *const Work_L = Work; | |||
| double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,451 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static int c__1 = 1; | |||
| static double c_b9 = -1.; | |||
| static double c_b10 = 1.; | |||
| /** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is a minor modification of LAPACK's dlasyf. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, double *a, int *lda, int *ipiv, | |||
| double *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; | |||
| double d__1; | |||
| /* Builtin functions */ | |||
| double sqrt(double); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static double t, r1, d11, d12, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static double alpha; | |||
| extern /* Subroutine */ int dscal_(int *, double *, double *, | |||
| int *); | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int dgemv_(char *, int *, int *, | |||
| double *, double *, int *, double *, int *, | |||
| double *, double *, int *, ftnlen); | |||
| static double dtemp, sfmin; | |||
| static int itemp; | |||
| extern /* Subroutine */ int dcopy_(int *, double *, int *, | |||
| double *, int *), dswap_(int *, double *, int | |||
| *, double *, int *); | |||
| static int kstep; | |||
| extern double dlamch_(char *, ftnlen); | |||
| static double absakk; | |||
| extern int idamax_(int *, double *, int *); | |||
| static double colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.) + 1.) / 8.; | |||
| sfmin = dlamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); | |||
| } else { | |||
| rowmax = 0.; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1)); | |||
| if (dtemp > rowmax) { | |||
| rowmax = dtemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) < | |||
| alpha * rowmax)) { | |||
| kp = imax; | |||
| dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = k - p; | |||
| dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - k + 1; | |||
| dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| a[kp + k * a_dim1] = a[kk + k * a_dim1]; | |||
| i__1 = k - 1 - kp; | |||
| dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - kk + 1; | |||
| dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) { | |||
| r1 = 1. / a[k + k * a_dim1]; | |||
| i__1 = k - 1; | |||
| dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else if (a[k + k * a_dim1] != 0.) { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| a[ii + k * a_dim1] /= a[k + k * a_dim1]; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| d12 = w[k - 1 + kw * w_dim1]; | |||
| d11 = w[k + kw * w_dim1] / d12; | |||
| d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; | |||
| t = 1. / (d11 * d22 - 1.); | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * | |||
| w_dim1] - w[j + kw * w_dim1]) / d12); | |||
| a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - | |||
| w[j + (kw - 1) * w_dim1]) / d12); | |||
| /* L20: */ | |||
| } | |||
| } | |||
| a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; | |||
| a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; | |||
| a[k + k * a_dim1] = w[k + kw * w_dim1]; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| jj = j - 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| i__1 = *n - j + 1; | |||
| dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j <= *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = *n - k + 1; | |||
| dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = *n - k + 1; | |||
| dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k | |||
| + 1) * w_dim1], &c__1, (ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); | |||
| } else { | |||
| rowmax = 0.; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1)); | |||
| if (dtemp > rowmax) { | |||
| rowmax = dtemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha | |||
| * rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p - k; | |||
| dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - p + 1; | |||
| dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
| c__1); | |||
| dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| a[kp + k * a_dim1] = a[kk + k * a_dim1]; | |||
| i__1 = kp - k - 1; | |||
| dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
| * a_dim1], lda); | |||
| i__1 = *n - kp + 1; | |||
| dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
| a_dim1], &c__1); | |||
| dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) { | |||
| r1 = 1. / a[k + k * a_dim1]; | |||
| i__1 = *n - k; | |||
| dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else if (a[k + k * a_dim1] != 0.) { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| a[ii + k * a_dim1] /= a[k + k * a_dim1]; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| d21 = w[k + 1 + k * w_dim1]; | |||
| d11 = w[k + 1 + (k + 1) * w_dim1] / d21; | |||
| d22 = w[k + k * w_dim1] / d21; | |||
| t = 1. / (d11 * d22 - 1.); | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[ | |||
| j + (k + 1) * w_dim1]) / d21); | |||
| a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) * | |||
| w_dim1] - w[j + k * w_dim1]) / d21); | |||
| /* L80: */ | |||
| } | |||
| } | |||
| a[k + k * a_dim1] = w[k + k * w_dim1]; | |||
| a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; | |||
| a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| jj = j + 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j >= 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,274 @@ | |||
| #include "relapack.h" | |||
| #include <math.h> | |||
| static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *, | |||
| const int *, const double *, const int *, const double *, const int *, | |||
| double *, const int *, const double *, const int *, const double *, | |||
| const int *, double *, const int *, double *, double *, double *, int *, | |||
| int *, int *); | |||
| /** DTGSYL solves the generalized Sylvester equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dtgsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html | |||
| * */ | |||
| void RELAPACK_dtgsyl( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, | |||
| const double *D, const int *ldD, const double *E, const int *ldE, | |||
| double *F, const int *ldF, | |||
| double *scale, double *dif, | |||
| double *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| // Parse arguments | |||
| const int notran = LAPACK(lsame)(trans, "N"); | |||
| const int tran = LAPACK(lsame)(trans, "T"); | |||
| // Compute work buffer size | |||
| int lwmin = 1; | |||
| if (notran && (*ijob == 1 || *ijob == 2)) | |||
| lwmin = MAX(1, 2 * *m * *n); | |||
| *info = 0; | |||
| // Check arguments | |||
| if (!tran && !notran) | |||
| *info = -1; | |||
| else if (notran && (*ijob < 0 || *ijob > 4)) | |||
| *info = -2; | |||
| else if (*m <= 0) | |||
| *info = -3; | |||
| else if (*n <= 0) | |||
| *info = -4; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -6; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -8; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -10; | |||
| else if (*ldD < MAX(1, *m)) | |||
| *info = -12; | |||
| else if (*ldE < MAX(1, *n)) | |||
| *info = -14; | |||
| else if (*ldF < MAX(1, *m)) | |||
| *info = -16; | |||
| else if (*lWork < lwmin && *lWork != -1) | |||
| *info = -20; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DTGSYL", &minfo); | |||
| return; | |||
| } | |||
| if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = lwmin; | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantrans = notran ? 'N' : 'T'; | |||
| // Constant | |||
| const double ZERO[] = { 0. }; | |||
| int isolve = 1; | |||
| int ifunc = 0; | |||
| if (notran) { | |||
| if (*ijob >= 3) { | |||
| ifunc = *ijob - 2; | |||
| LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
| LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
| } else if (*ijob >= 1) | |||
| isolve = 2; | |||
| } | |||
| double scale2; | |||
| int iround; | |||
| for (iround = 1; iround <= isolve; iround++) { | |||
| *scale = 1; | |||
| double dscale = 0; | |||
| double dsum = 1; | |||
| int pq; | |||
| RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); | |||
| if (dscale != 0) { | |||
| if (*ijob == 1 || *ijob == 3) | |||
| *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); | |||
| else | |||
| *dif = sqrt(pq) / (dscale * sqrt(dsum)); | |||
| } | |||
| if (isolve == 2) { | |||
| if (iround == 1) { | |||
| if (notran) | |||
| ifunc = *ijob; | |||
| scale2 = *scale; | |||
| LAPACK(dlacpy)("F", m, n, C, ldC, Work, m); | |||
| LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m); | |||
| LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
| LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
| } else { | |||
| LAPACK(dlacpy)("F", m, n, Work, m, C, ldC); | |||
| LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF); | |||
| *scale = scale2; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /** dtgsyl's recursive vompute kernel */ | |||
| static void RELAPACK_dtgsyl_rec( | |||
| const char *trans, const int *ifunc, const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, | |||
| const double *D, const int *ldD, const double *E, const int *ldE, | |||
| double *F, const int *ldF, | |||
| double *scale, double *dsum, double *dscale, | |||
| int *iWork, int *pq, int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) { | |||
| // Unblocked | |||
| LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| double scale1[] = { 1. }; | |||
| double scale2[] = { 1. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| int m1 = DREC_SPLIT(*m); | |||
| if (A[m1 + *ldA * (m1 - 1)]) | |||
| m1++; | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const double *const A_TL = A; | |||
| const double *const A_TR = A + *ldA * m1; | |||
| const double *const A_BR = A + *ldA * m1 + m1; | |||
| // C_T | |||
| // C_B | |||
| double *const C_T = C; | |||
| double *const C_B = C + m1; | |||
| // D_TL D_TR | |||
| // 0 D_BR | |||
| const double *const D_TL = D; | |||
| const double *const D_TR = D + *ldD * m1; | |||
| const double *const D_BR = D + *ldD * m1 + m1; | |||
| // F_T | |||
| // F_B | |||
| double *const F_T = F; | |||
| double *const F_B = F + m1; | |||
| if (*trans == 'N') { | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // F_T = F_T - D_TR * C_B | |||
| BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
| // C_B = C_B - A_TR^H * C_T | |||
| BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // C_B = C_B - D_TR^H * F_T | |||
| BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
| } | |||
| } | |||
| } else { | |||
| // Splitting | |||
| int n1 = DREC_SPLIT(*n); | |||
| if (B[n1 + *ldB * (n1 - 1)]) | |||
| n1++; | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const double *const B_TL = B; | |||
| const double *const B_TR = B + *ldB * n1; | |||
| const double *const B_BR = B + *ldB * n1 + n1; | |||
| // C_L C_R | |||
| double *const C_L = C; | |||
| double *const C_R = C + *ldC * n1; | |||
| // E_TL E_TR | |||
| // 0 E_BR | |||
| const double *const E_TL = E; | |||
| const double *const E_TR = E + *ldE * n1; | |||
| const double *const E_BR = E + *ldE * n1 + n1; | |||
| // F_L F_R | |||
| double *const F_L = F; | |||
| double *const F_R = F + *ldF * n1; | |||
| if (*trans == 'N') { | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
| // C_R = C_R + F_L * B_TR | |||
| BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
| // F_R = F_R + F_L * E_TR | |||
| BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
| // F_L = F_L + C_R * B_TR | |||
| BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
| // F_L = F_L + F_R * E_TR | |||
| BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
| } | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,169 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *, | |||
| const int *, const int *, const double *, const int *, const double *, | |||
| const int *, double *, const int *, double *, int *); | |||
| /** DTRSYL solves the real Sylvester matrix equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dtrsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html | |||
| * */ | |||
| void RELAPACK_dtrsyl( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, double *scale, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int notransA = LAPACK(lsame)(tranA, "N"); | |||
| const int transA = LAPACK(lsame)(tranA, "T"); | |||
| const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
| const int notransB = LAPACK(lsame)(tranB, "N"); | |||
| const int transB = LAPACK(lsame)(tranB, "T"); | |||
| const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
| *info = 0; | |||
| if (!transA && !ctransA && !notransA) | |||
| *info = -1; | |||
| else if (!transB && !ctransB && !notransB) | |||
| *info = -2; | |||
| else if (*isgn != 1 && *isgn != -1) | |||
| *info = -3; | |||
| else if (*m < 0) | |||
| *info = -4; | |||
| else if (*n < 0) | |||
| *info = -5; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -7; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -9; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -11; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DTRSYL", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); | |||
| const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C'); | |||
| // Recursive kernel | |||
| RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| /** dtrsyl's recursive compute kernel */ | |||
| static void RELAPACK_dtrsyl_rec( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, double *scale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) { | |||
| // Unblocked | |||
| RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const double MSGN[] = { -*isgn }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| double scale1[] = { 1. }; | |||
| double scale2[] = { 1. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| int m1 = DREC_SPLIT(*m); | |||
| if (A[m1 + *ldA * (m1 - 1)]) | |||
| m1++; | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const double *const A_TL = A; | |||
| const double *const A_TR = A + *ldA * m1; | |||
| const double *const A_BR = A + *ldA * m1 + m1; | |||
| // C_T | |||
| // C_B | |||
| double *const C_T = C; | |||
| double *const C_B = C + m1; | |||
| if (*tranA == 'N') { | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| } else { | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
| // C_B = C_B - A_TR' * C_T | |||
| BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
| } | |||
| } else { | |||
| // Splitting | |||
| int n1 = DREC_SPLIT(*n); | |||
| if (B[n1 + *ldB * (n1 - 1)]) | |||
| n1++; | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const double *const B_TL = B; | |||
| const double *const B_TR = B + *ldB * n1; | |||
| const double *const B_BR = B + *ldB * n1 + n1; | |||
| // C_L C_R | |||
| double *const C_L = C; | |||
| double *const C_R = C + *ldC * n1; | |||
| if (*tranB == 'N') { | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
| // C_R = C_R -/+ C_L * B_TR | |||
| BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| } else { | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
| // C_L = C_L -/+ C_R * B_TR' | |||
| BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,107 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, | |||
| double *, const int *, int *); | |||
| /** DTRTRI computes the inverse of a real upper or lower triangular matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dtrtri. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html | |||
| * */ | |||
| void RELAPACK_dtrtri( | |||
| const char *uplo, const char *diag, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int nounit = LAPACK(lsame)(diag, "N"); | |||
| const int unit = LAPACK(lsame)(diag, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (!nounit && !unit) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DTRTRI", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleandiag = nounit ? 'N' : 'U'; | |||
| // check for singularity | |||
| if (nounit) { | |||
| int i; | |||
| for (i = 0; i < *n; i++) | |||
| if (A[i + *ldA * i] == 0) { | |||
| *info = i; | |||
| return; | |||
| } | |||
| } | |||
| // Recursive kernel | |||
| RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
| } | |||
| /** dtrtri's recursive compute kernel */ | |||
| static void RELAPACK_dtrtri_rec( | |||
| const char *uplo, const char *diag, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_DTRTRI, 1)) { | |||
| // Unblocked | |||
| LAPACK(dtrti2)(uplo, diag, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = - A_BL * A_TL | |||
| BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BL = A_BR \ A_BL | |||
| BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TR = - A_TL * A_TR | |||
| BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
| // A_TR = A_TR / A_BR | |||
| BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,109 @@ | |||
| #include "stdlib.h" | |||
| #include "stdio.h" | |||
| #include "signal.h" | |||
| #include "f2c.h" | |||
| #ifndef SIGIOT | |||
| #ifdef SIGABRT | |||
| #define SIGIOT SIGABRT | |||
| #endif | |||
| #endif | |||
| void sig_die(const char *s, int kill) { | |||
| /* print error message, then clear buffers */ | |||
| fprintf(stderr, "%s\n", s); | |||
| if(kill) { | |||
| fflush(stderr); | |||
| /* now get a core */ | |||
| signal(SIGIOT, SIG_DFL); | |||
| abort(); | |||
| } else | |||
| exit(1); | |||
| } | |||
| void c_div(complex *c, complex *a, complex *b) { | |||
| double ratio, den; | |||
| double abr, abi, cr; | |||
| if( (abr = b->r) < 0.) | |||
| abr = - abr; | |||
| if( (abi = b->i) < 0.) | |||
| abi = - abi; | |||
| if( abr <= abi ) { | |||
| if(abi == 0) { | |||
| #ifdef IEEE_COMPLEX_DIVIDE | |||
| float af, bf; | |||
| af = bf = abr; | |||
| if (a->i != 0 || a->r != 0) | |||
| af = 1.; | |||
| c->i = c->r = af / bf; | |||
| return; | |||
| #else | |||
| sig_die("complex division by zero", 1); | |||
| #endif | |||
| } | |||
| ratio = (double)b->r / b->i ; | |||
| den = b->i * (1 + ratio*ratio); | |||
| cr = (a->r*ratio + a->i) / den; | |||
| c->i = (a->i*ratio - a->r) / den; | |||
| } else { | |||
| ratio = (double)b->i / b->r ; | |||
| den = b->r * (1 + ratio*ratio); | |||
| cr = (a->r + a->i*ratio) / den; | |||
| c->i = (a->i - a->r*ratio) / den; | |||
| } | |||
| c->r = cr; | |||
| } | |||
| void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) { | |||
| double ratio, den; | |||
| double abr, abi, cr; | |||
| if( (abr = b->r) < 0.) | |||
| abr = - abr; | |||
| if( (abi = b->i) < 0.) | |||
| abi = - abi; | |||
| if( abr <= abi ) { | |||
| if(abi == 0) { | |||
| #ifdef IEEE_COMPLEX_DIVIDE | |||
| if (a->i != 0 || a->r != 0) | |||
| abi = 1.; | |||
| c->i = c->r = abi / abr; | |||
| return; | |||
| #else | |||
| sig_die("complex division by zero", 1); | |||
| #endif | |||
| } | |||
| ratio = b->r / b->i ; | |||
| den = b->i * (1 + ratio*ratio); | |||
| cr = (a->r*ratio + a->i) / den; | |||
| c->i = (a->i*ratio - a->r) / den; | |||
| } else { | |||
| ratio = b->i / b->r ; | |||
| den = b->r * (1 + ratio*ratio); | |||
| cr = (a->r + a->i*ratio) / den; | |||
| c->i = (a->i - a->r*ratio) / den; | |||
| } | |||
| c->r = cr; | |||
| } | |||
| float r_imag(complex *z) { | |||
| return z->i; | |||
| } | |||
| void r_cnjg(complex *r, complex *z) { | |||
| float zi = z->i; | |||
| r->r = z->r; | |||
| r->i = -zi; | |||
| } | |||
| double d_imag(doublecomplex *z) { | |||
| return z->i; | |||
| } | |||
| void d_cnjg(doublecomplex *r, doublecomplex *z) { | |||
| double zi = z->i; | |||
| r->r = z->r; | |||
| r->i = -zi; | |||
| } | |||
| @@ -0,0 +1,223 @@ | |||
| /* f2c.h -- Standard Fortran to C header file */ | |||
| /** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." | |||
| - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ | |||
| #ifndef F2C_INCLUDE | |||
| #define F2C_INCLUDE | |||
| typedef long int integer; | |||
| typedef unsigned long int uinteger; | |||
| typedef char *address; | |||
| typedef short int shortint; | |||
| typedef float real; | |||
| typedef double doublereal; | |||
| typedef struct { real r, i; } complex; | |||
| typedef struct { doublereal r, i; } doublecomplex; | |||
| typedef long int logical; | |||
| typedef short int shortlogical; | |||
| typedef char logical1; | |||
| typedef char integer1; | |||
| #ifdef INTEGER_STAR_8 /* Adjust for integer*8. */ | |||
| typedef long long longint; /* system-dependent */ | |||
| typedef unsigned long long ulongint; /* system-dependent */ | |||
| #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) | |||
| #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) | |||
| #endif | |||
| #define TRUE_ (1) | |||
| #define FALSE_ (0) | |||
| /* Extern is for use with -E */ | |||
| #ifndef Extern | |||
| #define Extern extern | |||
| #endif | |||
| /* I/O stuff */ | |||
| #ifdef f2c_i2 | |||
| /* for -i2 */ | |||
| typedef short flag; | |||
| typedef short ftnlen; | |||
| typedef short ftnint; | |||
| #else | |||
| typedef long int flag; | |||
| typedef long int ftnlen; | |||
| typedef long int ftnint; | |||
| #endif | |||
| /*external read, write*/ | |||
| typedef struct | |||
| { flag cierr; | |||
| ftnint ciunit; | |||
| flag ciend; | |||
| char *cifmt; | |||
| ftnint cirec; | |||
| } cilist; | |||
| /*internal read, write*/ | |||
| typedef struct | |||
| { flag icierr; | |||
| char *iciunit; | |||
| flag iciend; | |||
| char *icifmt; | |||
| ftnint icirlen; | |||
| ftnint icirnum; | |||
| } icilist; | |||
| /*open*/ | |||
| typedef struct | |||
| { flag oerr; | |||
| ftnint ounit; | |||
| char *ofnm; | |||
| ftnlen ofnmlen; | |||
| char *osta; | |||
| char *oacc; | |||
| char *ofm; | |||
| ftnint orl; | |||
| char *oblnk; | |||
| } olist; | |||
| /*close*/ | |||
| typedef struct | |||
| { flag cerr; | |||
| ftnint cunit; | |||
| char *csta; | |||
| } cllist; | |||
| /*rewind, backspace, endfile*/ | |||
| typedef struct | |||
| { flag aerr; | |||
| ftnint aunit; | |||
| } alist; | |||
| /* inquire */ | |||
| typedef struct | |||
| { flag inerr; | |||
| ftnint inunit; | |||
| char *infile; | |||
| ftnlen infilen; | |||
| ftnint *inex; /*parameters in standard's order*/ | |||
| ftnint *inopen; | |||
| ftnint *innum; | |||
| ftnint *innamed; | |||
| char *inname; | |||
| ftnlen innamlen; | |||
| char *inacc; | |||
| ftnlen inacclen; | |||
| char *inseq; | |||
| ftnlen inseqlen; | |||
| char *indir; | |||
| ftnlen indirlen; | |||
| char *infmt; | |||
| ftnlen infmtlen; | |||
| char *inform; | |||
| ftnint informlen; | |||
| char *inunf; | |||
| ftnlen inunflen; | |||
| ftnint *inrecl; | |||
| ftnint *innrec; | |||
| char *inblank; | |||
| ftnlen inblanklen; | |||
| } inlist; | |||
| #define VOID void | |||
| union Multitype { /* for multiple entry points */ | |||
| integer1 g; | |||
| shortint h; | |||
| integer i; | |||
| /* longint j; */ | |||
| real r; | |||
| doublereal d; | |||
| complex c; | |||
| doublecomplex z; | |||
| }; | |||
| typedef union Multitype Multitype; | |||
| /*typedef long int Long;*/ /* No longer used; formerly in Namelist */ | |||
| struct Vardesc { /* for Namelist */ | |||
| char *name; | |||
| char *addr; | |||
| ftnlen *dims; | |||
| int type; | |||
| }; | |||
| typedef struct Vardesc Vardesc; | |||
| struct Namelist { | |||
| char *name; | |||
| Vardesc **vars; | |||
| int nvars; | |||
| }; | |||
| typedef struct Namelist Namelist; | |||
| #define abs(x) ((x) >= 0 ? (x) : -(x)) | |||
| #define dabs(x) (doublereal)abs(x) | |||
| #define min(a,b) ((a) <= (b) ? (a) : (b)) | |||
| #define max(a,b) ((a) >= (b) ? (a) : (b)) | |||
| #define dmin(a,b) (doublereal)min(a,b) | |||
| #define dmax(a,b) (doublereal)max(a,b) | |||
| #define bit_test(a,b) ((a) >> (b) & 1) | |||
| #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) | |||
| #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) | |||
| /* procedure parameter types for -A and -C++ */ | |||
| #define F2C_proc_par_types 1 | |||
| #ifdef __cplusplus | |||
| typedef int /* Unknown procedure type */ (*U_fp)(...); | |||
| typedef shortint (*J_fp)(...); | |||
| typedef integer (*I_fp)(...); | |||
| typedef real (*R_fp)(...); | |||
| typedef doublereal (*D_fp)(...), (*E_fp)(...); | |||
| typedef /* Complex */ VOID (*C_fp)(...); | |||
| typedef /* Double Complex */ VOID (*Z_fp)(...); | |||
| typedef logical (*L_fp)(...); | |||
| typedef shortlogical (*K_fp)(...); | |||
| typedef /* Character */ VOID (*H_fp)(...); | |||
| typedef /* Subroutine */ int (*S_fp)(...); | |||
| #else | |||
| typedef int /* Unknown procedure type */ (*U_fp)(); | |||
| typedef shortint (*J_fp)(); | |||
| typedef integer (*I_fp)(); | |||
| typedef real (*R_fp)(); | |||
| typedef doublereal (*D_fp)(), (*E_fp)(); | |||
| typedef /* Complex */ VOID (*C_fp)(); | |||
| typedef /* Double Complex */ VOID (*Z_fp)(); | |||
| typedef logical (*L_fp)(); | |||
| typedef shortlogical (*K_fp)(); | |||
| typedef /* Character */ VOID (*H_fp)(); | |||
| typedef /* Subroutine */ int (*S_fp)(); | |||
| #endif | |||
| /* E_fp is for real functions when -R is not specified */ | |||
| typedef VOID C_f; /* complex function */ | |||
| typedef VOID H_f; /* character function */ | |||
| typedef VOID Z_f; /* double complex function */ | |||
| typedef doublereal E_f; /* real function with -R not specified */ | |||
| /* undef any lower-case symbols that your C compiler predefines, e.g.: */ | |||
| #ifndef Skip_f2c_Undefs | |||
| #undef cray | |||
| #undef gcos | |||
| #undef mc68010 | |||
| #undef mc68020 | |||
| #undef mips | |||
| #undef pdp11 | |||
| #undef sgi | |||
| #undef sparc | |||
| #undef sun | |||
| #undef sun2 | |||
| #undef sun3 | |||
| #undef sun4 | |||
| #undef u370 | |||
| #undef u3b | |||
| #undef u3b2 | |||
| #undef u3b5 | |||
| #undef unix | |||
| #undef vax | |||
| #endif | |||
| #endif | |||
| @@ -0,0 +1,80 @@ | |||
| #ifndef LAPACK_H | |||
| #define LAPACK_H | |||
| extern int LAPACK(lsame)(const char *, const char *); | |||
| extern int LAPACK(xerbla)(const char *, const int *); | |||
| extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); | |||
| extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); | |||
| extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); | |||
| extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); | |||
| extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); | |||
| extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); | |||
| extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); | |||
| extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); | |||
| extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); | |||
| extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); | |||
| extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); | |||
| extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); | |||
| extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
| extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
| extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); | |||
| extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); | |||
| extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *); | |||
| extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *); | |||
| extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); | |||
| extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); | |||
| extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *); | |||
| extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *); | |||
| extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *); | |||
| extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *); | |||
| #endif /* LAPACK_H */ | |||
| @@ -0,0 +1,607 @@ | |||
| #include "relapack.h" | |||
| //////////// | |||
| // XLAUUM // | |||
| //////////// | |||
| #if INCLUDE_SLAUUM | |||
| void LAPACK(slauum)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_slauum(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DLAUUM | |||
| void LAPACK(dlauum)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dlauum(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CLAUUM | |||
| void LAPACK(clauum)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_clauum(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZLAUUM | |||
| void LAPACK(zlauum)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zlauum(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XSYGST // | |||
| //////////// | |||
| #if INCLUDE_SSYGST | |||
| void LAPACK(ssygst)( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DSYGST | |||
| void LAPACK(dsygst)( | |||
| const int *itype, const char *uplo, const int *n, | |||
| double *A, const int *ldA, const double *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CHEGST | |||
| void LAPACK(chegst)( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZHEGST | |||
| void LAPACK(zhegst)( | |||
| const int *itype, const char *uplo, const int *n, | |||
| double *A, const int *ldA, const double *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XTRTRI // | |||
| //////////// | |||
| #if INCLUDE_STRTRI | |||
| void LAPACK(strtri)( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_strtri(uplo, diag, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DTRTRI | |||
| void LAPACK(dtrtri)( | |||
| const char *uplo, const char *diag, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CTRTRI | |||
| void LAPACK(ctrtri)( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZTRTRI | |||
| void LAPACK(ztrtri)( | |||
| const char *uplo, const char *diag, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XPOTRF // | |||
| //////////// | |||
| #if INCLUDE_SPOTRF | |||
| void LAPACK(spotrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_spotrf(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DPOTRF | |||
| void LAPACK(dpotrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dpotrf(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CPOTRF | |||
| void LAPACK(cpotrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_cpotrf(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZPOTRF | |||
| void LAPACK(zpotrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zpotrf(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XPBTRF // | |||
| //////////// | |||
| #if INCLUDE_SPBTRF | |||
| void LAPACK(spbtrf)( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DPBTRF | |||
| void LAPACK(dpbtrf)( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CPBTRF | |||
| void LAPACK(cpbtrf)( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZPBTRF | |||
| void LAPACK(zpbtrf)( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XSYTRF // | |||
| //////////// | |||
| #if INCLUDE_SSYTRF | |||
| void LAPACK(ssytrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DSYTRF | |||
| void LAPACK(dsytrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CSYTRF | |||
| void LAPACK(csytrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZSYTRF | |||
| void LAPACK(zsytrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CHETRF | |||
| void LAPACK(chetrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZHETRF | |||
| void LAPACK(zhetrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_SSYTRF_ROOK | |||
| void LAPACK(ssytrf_rook)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DSYTRF_ROOK | |||
| void LAPACK(dsytrf_rook)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CSYTRF_ROOK | |||
| void LAPACK(csytrf_rook)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZSYTRF_ROOK | |||
| void LAPACK(zsytrf_rook)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CHETRF_ROOK | |||
| void LAPACK(chetrf_rook)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZHETRF_ROOK | |||
| void LAPACK(zhetrf_rook)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XGETRF // | |||
| //////////// | |||
| #if INCLUDE_SGETRF | |||
| void LAPACK(sgetrf)( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DGETRF | |||
| void LAPACK(dgetrf)( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CGETRF | |||
| void LAPACK(cgetrf)( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZGETRF | |||
| void LAPACK(zgetrf)( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XGBTRF // | |||
| //////////// | |||
| #if INCLUDE_SGBTRF | |||
| void LAPACK(sgbtrf)( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DGBTRF | |||
| void LAPACK(dgbtrf)( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| double *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CGBTRF | |||
| void LAPACK(cgbtrf)( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZGBTRF | |||
| void LAPACK(zgbtrf)( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| double *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XTRSYL // | |||
| //////////// | |||
| #if INCLUDE_STRSYL | |||
| void LAPACK(strsyl)( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DTRSYL | |||
| void LAPACK(dtrsyl)( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, double *scale, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CTRSYL | |||
| void LAPACK(ctrsyl)( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZTRSYL | |||
| void LAPACK(ztrsyl)( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, double *scale, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XTGSYL // | |||
| //////////// | |||
| #if INCLUDE_STGSYL | |||
| void LAPACK(stgsyl)( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dif, | |||
| float *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DTGSYL | |||
| void LAPACK(dtgsyl)( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, | |||
| const double *D, const int *ldD, const double *E, const int *ldE, | |||
| double *F, const int *ldF, | |||
| double *scale, double *dif, | |||
| double *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CTGSYL | |||
| void LAPACK(ctgsyl)( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dif, | |||
| float *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZTGSYL | |||
| void LAPACK(ztgsyl)( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, | |||
| const double *D, const int *ldD, const double *E, const int *ldE, | |||
| double *F, const int *ldF, | |||
| double *scale, double *dif, | |||
| double *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XGEMMT // | |||
| //////////// | |||
| #if INCLUDE_SGEMMT | |||
| void LAPACK(sgemmt)( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| RELAPACK_sgemmt(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DGEMMT | |||
| void LAPACK(dgemmt)( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| RELAPACK_dgemmt(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CGEMMT | |||
| void LAPACK(cgemmt)( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| RELAPACK_cgemmt(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZGEMMT | |||
| void LAPACK(zgemmt)( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| RELAPACK_zgemmt(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,607 @@ | |||
| #include "relapack.h" | |||
| //////////// | |||
| // XLAUUM // | |||
| //////////// | |||
| #if INCLUDE_SLAUUM | |||
| void LAPACK(slauum)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_slauum(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DLAUUM | |||
| void LAPACK(dlauum)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dlauum(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CLAUUM | |||
| void LAPACK(clauum)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_clauum(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZLAUUM | |||
| void LAPACK(zlauum)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zlauum(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XSYGST // | |||
| //////////// | |||
| #if INCLUDE_SSYGST | |||
| void LAPACK(ssygst)( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DSYGST | |||
| void LAPACK(dsygst)( | |||
| const int *itype, const char *uplo, const int *n, | |||
| double *A, const int *ldA, const double *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CSYGST | |||
| void LAPACK(csygst)( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZSYGST | |||
| void LAPACK(zsygst)( | |||
| const int *itype, const char *uplo, const int *n, | |||
| double *A, const int *ldA, const double *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XTRTRI // | |||
| //////////// | |||
| #if INCLUDE_STRTRI | |||
| void LAPACK(strtri)( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_strtri(uplo, diag, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DTRTRI | |||
| void LAPACK(dtrtri)( | |||
| const char *uplo, const char *diag, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CTRTRI | |||
| void LAPACK(ctrtri)( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZTRTRI | |||
| void LAPACK(ztrtri)( | |||
| const char *uplo, const char *diag, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XPOTRF // | |||
| //////////// | |||
| #if INCLUDE_SPOTRF | |||
| void LAPACK(spotrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_spotrf(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DPOTRF | |||
| void LAPACK(dpotrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dpotrf(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CPOTRF | |||
| void LAPACK(cpotrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_cpotrf(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZPOTRF | |||
| void LAPACK(zpotrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zpotrf(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XPBTRF // | |||
| //////////// | |||
| #if INCLUDE_SPBTRF | |||
| void LAPACK(spbtrf)( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DPBTRF | |||
| void LAPACK(dpbtrf)( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CPBTRF | |||
| void LAPACK(cpbtrf)( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZPBTRF | |||
| void LAPACK(zpbtrf)( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XSYTRF // | |||
| //////////// | |||
| #if INCLUDE_SSYTRF | |||
| void LAPACK(ssytrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DSYTRF | |||
| void LAPACK(dsytrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CSYTRF | |||
| void LAPACK(csytrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZSYTRF | |||
| void LAPACK(zsytrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CHETRF | |||
| void LAPACK(chetrf)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZHETRF | |||
| void LAPACK(zhetrf)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_SSYTRF_ROOK | |||
| void LAPACK(ssytrf_rook)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DSYTRF_ROOK | |||
| void LAPACK(dsytrf_rook)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CSYTRF_ROOK | |||
| void LAPACK(csytrf_rook)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZSYTRF_ROOK | |||
| void LAPACK(zsytrf_rook)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CHETRF_ROOK | |||
| void LAPACK(chetrf_rook)( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZHETRF_ROOK | |||
| void LAPACK(zhetrf_rook)( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XGETRF // | |||
| //////////// | |||
| #if INCLUDE_SGETRF | |||
| void LAPACK(sgetrf)( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DGETRF | |||
| void LAPACK(dgetrf)( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CGETRF | |||
| void LAPACK(cgetrf)( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZGETRF | |||
| void LAPACK(zgetrf)( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XGBTRF // | |||
| //////////// | |||
| #if INCLUDE_SGBTRF | |||
| void LAPACK(sgbtrf)( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DGBTRF | |||
| void LAPACK(dgbtrf)( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| double *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CGBTRF | |||
| void LAPACK(cgbtrf)( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZGBTRF | |||
| void LAPACK(zgbtrf)( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| double *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XTRSYL // | |||
| //////////// | |||
| #if INCLUDE_STRSYL | |||
| void LAPACK(strsyl)( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DTRSYL | |||
| void LAPACK(dtrsyl)( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, double *scale, | |||
| int *info | |||
| ) { | |||
| RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CTRSYL | |||
| void LAPACK(ctrsyl)( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZTRSYL | |||
| void LAPACK(ztrsyl)( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, double *scale, | |||
| int *info | |||
| ) { | |||
| RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XTGSYL // | |||
| //////////// | |||
| #if INCLUDE_STGSYL | |||
| void LAPACK(stgsyl)( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dif, | |||
| float *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DTGSYL | |||
| void LAPACK(dtgsyl)( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, | |||
| const double *D, const int *ldD, const double *E, const int *ldE, | |||
| double *F, const int *ldF, | |||
| double *scale, double *dif, | |||
| double *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CTGSYL | |||
| void LAPACK(ctgsyl)( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dif, | |||
| float *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZTGSYL | |||
| void LAPACK(ztgsyl)( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, | |||
| const double *D, const int *ldD, const double *E, const int *ldE, | |||
| double *F, const int *ldF, | |||
| double *scale, double *dif, | |||
| double *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); | |||
| } | |||
| #endif | |||
| //////////// | |||
| // XGEMMT // | |||
| //////////// | |||
| #if INCLUDE_SGEMMT | |||
| void LAPACK(sgemmt)( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| RELAPACK_sgemmt(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_DGEMMT | |||
| void LAPACK(dgemmt)( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| RELAPACK_dgemmt(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_CGEMMT | |||
| void LAPACK(cgemmt)( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| RELAPACK_cgemmt(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| #if INCLUDE_ZGEMMT | |||
| void LAPACK(zgemmt)( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| RELAPACK_zgemmt(uplo, n, A, ldA, info); | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,60 @@ | |||
| #ifndef RELAPACK_INT_H | |||
| #define RELAPACK_INT_H | |||
| #include "../config.h" | |||
| #include "../inc/relapack.h" | |||
| // add an underscore to BLAS routines (or not) | |||
| #if BLAS_UNDERSCORE | |||
| #define BLAS(routine) routine ## _ | |||
| #else | |||
| #define BLAS(routine) routine | |||
| #endif | |||
| // add an underscore to LAPACK routines (or not) | |||
| #if LAPACK_UNDERSCORE | |||
| #define LAPACK(routine) routine ## _ | |||
| #else | |||
| #define LAPACK(routine) routine | |||
| #endif | |||
| // minimum and maximum macros | |||
| #define MAX(a, b) ((a) > (b) ? (a) : (b)) | |||
| #define MIN(a, b) ((a) < (b) ? (a) : (b)) | |||
| // REC_SPLIT(n) returns how a problem of size n is split recursively. | |||
| // If n >= 16, we ensure that the size of at least one of the halves is | |||
| // divisible by 8 (the cache line size in most CPUs), while both halves are | |||
| // still as close as possible in size. | |||
| // If n < 16 the problem is simply split in the middle. (Note that the | |||
| // crossoversize is usually larger than 16.) | |||
| #define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2) | |||
| #define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2) | |||
| #define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2) | |||
| #define ZREC_SPLIT(n) ((n >= 8) ? ((n + 4) / 8) * 4 : n / 2) | |||
| #include "lapack.h" | |||
| #include "blas.h" | |||
| // sytrf helper routines | |||
| void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); | |||
| void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
| void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); | |||
| // trsyl helper routines | |||
| void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
| void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
| void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); | |||
| void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); | |||
| #endif /* RELAPACK_INT_H */ | |||
| @@ -0,0 +1,227 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, | |||
| const int *, float *, const int *, int *, float *, const int *, float *, | |||
| const int *, int *); | |||
| /** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's sgbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_sgbtrf( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kl < 0) | |||
| *info = -3; | |||
| else if (*ku < 0) | |||
| *info = -4; | |||
| else if (*ldAb < 2 * *kl + *ku + 1) | |||
| *info = -6; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("SGBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Constant | |||
| const float ZERO[] = { 0. }; | |||
| // Result upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskewg A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| float *const A = Ab + kv; | |||
| // Zero upper diagonal fill-in elements | |||
| int i, j; | |||
| for (j = 0; j < *n; j++) { | |||
| float *const A_j = A + *ldA * j; | |||
| for (i = MAX(0, j - kv); i < j - *ku; i++) | |||
| A_j[i] = 0.; | |||
| } | |||
| // Allocate work space | |||
| const int n1 = SREC_SPLIT(*n); | |||
| const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
| const int nWorkl = (kv > n1) ? n1 : kv; | |||
| const int mWorku = (*kl > n1) ? n1 : *kl; | |||
| const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
| float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); | |||
| float *Worku = malloc(mWorku * nWorku * sizeof(float)); | |||
| LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
| LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
| // Recursive kernel | |||
| RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
| // Free work space | |||
| free(Workl); | |||
| free(Worku); | |||
| } | |||
| /** sgbtrf's recursive compute kernel */ | |||
| static void RELAPACK_sgbtrf_rec( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterators | |||
| int i, j; | |||
| // Output upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| float *const A = Ab + kv; | |||
| // Splitting | |||
| const int n1 = MIN(SREC_SPLIT(*n), *kl); | |||
| const int n2 = *n - n1; | |||
| const int m1 = MIN(n1, *m); | |||
| const int m2 = *m - m1; | |||
| const int mn1 = MIN(m1, n1); | |||
| const int mn2 = MIN(m2, n2); | |||
| // Ab_L * | |||
| // Ab_BR | |||
| float *const Ab_L = Ab; | |||
| float *const Ab_BR = Ab + *ldAb * n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + *ldA * n1; | |||
| float *const A_BL = A + m1; | |||
| float *const A_BR = A + *ldA * n1 + m1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, kv - n1); | |||
| const int n22 = MIN(n2 - n21, n1); | |||
| const int m21 = MIN(m2, *kl - m1); | |||
| const int m22 = MIN(m2 - m21, m1); | |||
| // n1 n21 n22 | |||
| // m * A_Rl ARr | |||
| float *const A_Rl = A_R; | |||
| float *const A_Rr = A_R + *ldA * n21; | |||
| // n1 n21 n22 | |||
| // m1 * A_TRl A_TRr | |||
| // m21 A_BLt A_BRtl A_BRtr | |||
| // m22 A_BLb A_BRbl A_BRbr | |||
| float *const A_TRl = A_TR; | |||
| float *const A_TRr = A_TR + *ldA * n21; | |||
| float *const A_BLt = A_BL; | |||
| float *const A_BLb = A_BL + m21; | |||
| float *const A_BRtl = A_BR; | |||
| float *const A_BRtr = A_BR + *ldA * n21; | |||
| float *const A_BRbl = A_BR + m21; | |||
| float *const A_BRbr = A_BR + *ldA * n21 + m21; | |||
| // recursion(Ab_L, ipiv_T) | |||
| RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
| // Workl = A_BLb | |||
| LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
| // partially redo swaps in A_L | |||
| for (i = 0; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); | |||
| else | |||
| BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); | |||
| } | |||
| } | |||
| // apply pivots to A_Rl | |||
| LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
| // apply pivots to A_Rr columnwise | |||
| for (j = 0; j < n22; j++) { | |||
| float *const A_Rrj = A_Rr + *ldA * j; | |||
| for (i = j; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| const float tmp = A_Rrj[i]; | |||
| A_Rrj[i] = A_Rr[ip]; | |||
| A_Rrj[ip] = tmp; | |||
| } | |||
| } | |||
| } | |||
| // A_TRl = A_TL \ A_TRl | |||
| BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // Worku = A_TRr | |||
| LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
| // Worku = A_TL \ Worku | |||
| BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
| // A_TRr = Worku | |||
| LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_TRl | |||
| BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // A_BRbl = A_BRbl - Workl * A_TRl | |||
| BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRtr = A_BRtr - A_BLt * Worku | |||
| BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Workl * Worku | |||
| BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
| // partially undo swaps in A_L | |||
| for (i = mn1 - 1; i >= 0; i--) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); | |||
| else | |||
| BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); | |||
| } | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| for (i = 0; i < mn2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,165 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, | |||
| const int *, const int *, const float *, const float *, const int *, | |||
| const float *, const int *, const float *, float *, const int *); | |||
| static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, | |||
| const int *, const int *, const float *, const float *, const int *, | |||
| const float *, const int *, const float *, float *, const int *); | |||
| /** SGEMMT computes a matrix-matrix product with general matrices but updates | |||
| * only the upper or lower triangular part of the result matrix. | |||
| * | |||
| * This routine performs the same operation as the BLAS routine | |||
| * sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
| * but only updates the triangular part of C specified by uplo: | |||
| * If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
| * otherwise the upper triangular part is updated. | |||
| * */ | |||
| void RELAPACK_sgemmt( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| #if HAVE_XGEMMT | |||
| BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| #else | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int notransA = LAPACK(lsame)(transA, "N"); | |||
| const int tranA = LAPACK(lsame)(transA, "T"); | |||
| const int notransB = LAPACK(lsame)(transB, "N"); | |||
| const int tranB = LAPACK(lsame)(transB, "T"); | |||
| int info = 0; | |||
| if (!lower && !upper) | |||
| info = 1; | |||
| else if (!tranA && !notransA) | |||
| info = 2; | |||
| else if (!tranB && !notransB) | |||
| info = 3; | |||
| else if (*n < 0) | |||
| info = 4; | |||
| else if (*k < 0) | |||
| info = 5; | |||
| else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
| info = 8; | |||
| else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
| info = 10; | |||
| else if (*ldC < MAX(1, *n)) | |||
| info = 13; | |||
| if (info) { | |||
| LAPACK(xerbla)("SGEMMT", &info); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleantransA = notransA ? 'N' : 'T'; | |||
| const char cleantransB = notransB ? 'N' : 'T'; | |||
| // Recursive kernel | |||
| RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| #endif | |||
| } | |||
| /** sgemmt's recursive compute kernel */ | |||
| static void RELAPACK_sgemmt_rec( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { | |||
| // Unblocked | |||
| RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| } | |||
| // Splitting | |||
| const int n1 = SREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_T | |||
| // A_B | |||
| const float *const A_T = A; | |||
| const float *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); | |||
| // B_L B_R | |||
| const float *const B_L = B; | |||
| const float *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); | |||
| // C_TL C_TR | |||
| // C_BL C_BR | |||
| float *const C_TL = C; | |||
| float *const C_TR = C + *ldC * n1; | |||
| float *const C_BL = C + n1; | |||
| float *const C_BR = C + *ldC * n1 + n1; | |||
| // recursion(C_TL) | |||
| RELAPACK_sgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
| if (*uplo == 'L') | |||
| // C_BL = alpha A_B B_L + beta C_BL | |||
| BLAS(sgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
| else | |||
| // C_TR = alpha A_T B_R + beta C_TR | |||
| BLAS(sgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
| // recursion(C_BR) | |||
| RELAPACK_sgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
| } | |||
| /** sgemmt's unblocked compute kernel */ | |||
| static void RELAPACK_sgemmt_rec2( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| const int incB = (*transB == 'N') ? 1 : *ldB; | |||
| const int incC = 1; | |||
| int i; | |||
| for (i = 0; i < *n; i++) { | |||
| // A_0 | |||
| // A_i | |||
| const float *const A_0 = A; | |||
| const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i); | |||
| // * B_i * | |||
| const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i); | |||
| // * C_0i * | |||
| // * C_ii * | |||
| float *const C_0i = C + *ldC * i; | |||
| float *const C_ii = C + *ldC * i + i; | |||
| if (*uplo == 'L') { | |||
| const int nmi = *n - i; | |||
| if (*transA == 'N') | |||
| BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| else | |||
| BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| } else { | |||
| const int ip1 = i + 1; | |||
| if (*transA == 'N') | |||
| BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| else | |||
| BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, | |||
| int *, int *); | |||
| /** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's sgetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html | |||
| * */ | |||
| void RELAPACK_sgetrf( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("SGETRF", &minfo); | |||
| return; | |||
| } | |||
| const int sn = MIN(*m, *n); | |||
| RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| if (*m < *n) { | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const int iONE[] = { 1. }; | |||
| // Splitting | |||
| const int rn = *n - *m; | |||
| // A_L A_R | |||
| const float *const A_L = A; | |||
| float *const A_R = A + *ldA * *m; | |||
| // A_R = apply(ipiv, A_R) | |||
| LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
| // A_R = A_L \ A_R | |||
| BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
| } | |||
| } | |||
| /** sgetrf's recursive compute kernel */ | |||
| static void RELAPACK_sgetrf_rec( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int n1 = SREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| const int m2 = *m - n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + *ldA * n1; | |||
| float *const A_BL = A + n1; | |||
| float *const A_BR = A + *ldA * n1 + n1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // recursion(A_L, ipiv_T) | |||
| RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
| // apply pivots to A_R | |||
| LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
| // A_TR = A_TL \ A_TR | |||
| BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_BL * A_TR | |||
| BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
| // recursion(A_BR, ipiv_B) | |||
| RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // apply pivots to A_BL | |||
| LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_slauum_rec(const char *, const int *, float *, | |||
| const int *, int *); | |||
| /** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's slauum. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html | |||
| * */ | |||
| void RELAPACK_slauum( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("SLAUUM", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** slauum's recursive compute kernel */ | |||
| static void RELAPACK_slauum_rec( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SLAUUM, 1)) { | |||
| // Unblocked | |||
| LAPACK(slauu2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| // Splitting | |||
| const int n1 = SREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + *ldA * n1; | |||
| float *const A_BL = A + n1; | |||
| float *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_slauum_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*uplo == 'L') { | |||
| // A_TL = A_TL + A_BL' * A_BL | |||
| BLAS(ssyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
| // A_BL = A_BR' * A_BL | |||
| BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TL = A_TL + A_TR * A_TR' | |||
| BLAS(ssyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
| // A_TR = A_TR * A_BR' | |||
| BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_slauum_rec(uplo, &n2, A_BR, ldA, info); | |||
| } | |||
| @@ -0,0 +1,157 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, | |||
| float *, const int *, float *, const int *, int *); | |||
| /** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's spbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_spbtrf( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kd < 0) | |||
| *info = -3; | |||
| else if (*ldAb < *kd + 1) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("SPBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Constant | |||
| const float ZERO[] = { 0. }; | |||
| // Allocate work space | |||
| const int n1 = SREC_SPLIT(*n); | |||
| const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
| const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
| float *Work = malloc(mWork * nWork * sizeof(float)); | |||
| LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
| // Recursive kernel | |||
| RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
| // Free work space | |||
| free(Work); | |||
| } | |||
| /** spbtrf's recursive compute kernel */ | |||
| static void RELAPACK_spbtrf_rec( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| float *Work, const int *ldWork, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| float *const A = Ab + ((*uplo == 'L') ? 0 : *kd); | |||
| // Splitting | |||
| const int n1 = MIN(SREC_SPLIT(*n), *kd); | |||
| const int n2 = *n - n1; | |||
| // * * | |||
| // * Ab_BR | |||
| float *const Ab_BR = Ab + *ldAb * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + *ldA * n1; | |||
| float *const A_BL = A + n1; | |||
| float *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, *kd - n1); | |||
| const int n22 = MIN(n2 - n21, *kd); | |||
| // n1 n21 n22 | |||
| // n1 * A_TRl A_TRr | |||
| // n21 A_BLt A_BRtl A_BRtr | |||
| // n22 A_BLb A_BRbl A_BRbr | |||
| float *const A_TRl = A_TR; | |||
| float *const A_TRr = A_TR + *ldA * n21; | |||
| float *const A_BLt = A_BL; | |||
| float *const A_BLb = A_BL + n21; | |||
| float *const A_BRtl = A_BR; | |||
| float *const A_BRtr = A_BR + *ldA * n21; | |||
| float *const A_BRbl = A_BR + n21; | |||
| float *const A_BRbr = A_BR + *ldA * n21 + n21; | |||
| if (*uplo == 'L') { | |||
| // A_BLt = ABLt / A_TL' | |||
| BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_BLt' | |||
| BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); | |||
| // Work = A_BLb | |||
| LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); | |||
| // Work = Work / A_TL' | |||
| BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); | |||
| // A_BRbl = A_BRbl - Work * A_BLt' | |||
| BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRbr = A_BRbr - Work * Work' | |||
| BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
| // A_BLb = Work | |||
| LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); | |||
| } else { | |||
| // A_TRl = A_TL' \ A_TRl | |||
| BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // A_BRtl = A_BRtl - A_TRl' * A_TRl | |||
| BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // Work = A_TRr | |||
| LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); | |||
| // Work = A_TL' \ Work | |||
| BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); | |||
| // A_BRtr = A_BRtr - A_TRl' * Work | |||
| BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Work' * Work | |||
| BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
| // A_TRr = Work | |||
| LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| if (*kd > n1) | |||
| RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info); | |||
| else | |||
| RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,92 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_spotrf_rec(const char *, const int *, float *, | |||
| const int *, int *); | |||
| /** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's spotrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html | |||
| * */ | |||
| void RELAPACK_spotrf( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("SPOTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** spotrf's recursive compute kernel */ | |||
| static void RELAPACK_spotrf_rec( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(spotf2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| // Splitting | |||
| const int n1 = SREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + *ldA * n1; | |||
| float *const A_BL = A + n1; | |||
| float *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / A_TL' | |||
| BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * A_BL' | |||
| BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
| } else { | |||
| // A_TR = A_TL' \ A_TR | |||
| BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * A_TR | |||
| BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,212 @@ | |||
| #include "relapack.h" | |||
| #if XSYGST_ALLOW_MALLOC | |||
| #include "stdlib.h" | |||
| #endif | |||
| static void RELAPACK_ssygst_rec(const int *, const char *, const int *, | |||
| float *, const int *, const float *, const int *, | |||
| float *, const int *, int *); | |||
| /** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ssygst. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html | |||
| * */ | |||
| void RELAPACK_ssygst( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (*itype < 1 || *itype > 3) | |||
| *info = -1; | |||
| else if (!lower && !upper) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -7; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("SSYGST", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Allocate work space | |||
| float *Work = NULL; | |||
| int lWork = 0; | |||
| #if XSYGST_ALLOW_MALLOC | |||
| const int n1 = SREC_SPLIT(*n); | |||
| lWork = n1 * (*n - n1); | |||
| Work = malloc(lWork * sizeof(float)); | |||
| if (!Work) | |||
| lWork = 0; | |||
| #endif | |||
| // Recursive kernel | |||
| RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
| // Free work space | |||
| #if XSYGST_ALLOW_MALLOC | |||
| if (Work) | |||
| free(Work); | |||
| #endif | |||
| } | |||
| /** ssygst's recursive compute kernel */ | |||
| static void RELAPACK_ssygst_rec( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SSYGST, 1)) { | |||
| // Unblocked | |||
| LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ZERO[] = { 0. }; | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| const float HALF[] = { .5 }; | |||
| const float MHALF[] = { -.5 }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterator | |||
| int i; | |||
| // Splitting | |||
| const int n1 = SREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + *ldA * n1; | |||
| float *const A_BL = A + n1; | |||
| float *const A_BR = A + *ldA * n1 + n1; | |||
| // B_TL B_TR | |||
| // B_BL B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + *ldB * n1; | |||
| const float *const B_BL = B + n1; | |||
| const float *const B_BR = B + *ldB * n1 + n1; | |||
| // recursion(A_TL, B_TL) | |||
| RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); | |||
| if (*itype == 1) | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / B_TL' | |||
| BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = -1/2 * B_BL * A_TL | |||
| BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
| } else | |||
| // A_BL = A_BL - 1/2 B_BL * A_TL | |||
| BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' | |||
| BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
| else | |||
| // A_BL = A_BL - 1/2 B_BL * A_TL | |||
| BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BL = B_BR \ A_BL | |||
| BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
| } else { | |||
| // A_TR = B_TL' \ A_TR | |||
| BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = -1/2 * A_TL * B_TR | |||
| BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
| // A_TR = A_BL + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
| } else | |||
| // A_TR = A_TR - 1/2 A_TL * B_TR | |||
| BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR | |||
| BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_TR = A_BL + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
| else | |||
| // A_TR = A_TR - 1/2 A_TL * B_TR | |||
| BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TR = A_TR / B_BR | |||
| BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
| } | |||
| else | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL * B_TL | |||
| BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = 1/2 * A_BR * B_BL | |||
| BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
| } else | |||
| // A_BL = A_BL + 1/2 A_BR * B_BL | |||
| BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL | |||
| BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); | |||
| else | |||
| // A_BL = A_BL + 1/2 A_BR * B_BL | |||
| BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BL = B_BR * A_BL | |||
| BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
| } else { | |||
| // A_TR = B_TL * A_TR | |||
| BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
| if (*lWork > n2 * n1) { | |||
| // T = 1/2 * B_TR * A_BR | |||
| BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
| // A_TR = A_TR + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
| } else | |||
| // A_TR = A_TR + 1/2 B_TR A_BR | |||
| BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' | |||
| BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); | |||
| if (*lWork > n2 * n1) | |||
| // A_TR = A_TR + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); | |||
| else | |||
| // A_TR = A_TR + 1/2 B_TR * A_BR | |||
| BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TR = A_TR * B_BR | |||
| BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR, B_BR) | |||
| RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); | |||
| } | |||
| @@ -0,0 +1,238 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** SSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ssytrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html | |||
| * */ | |||
| void RELAPACK_ssytrf( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("SSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy arguments | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** ssytrf's recursive compute kernel */ | |||
| static void RELAPACK_ssytrf_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_SSYTRF, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterator | |||
| int i; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = SREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| float *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| float *const A_BL = A + n1; | |||
| float *const A_BR = A + *ldA * n1 + n1; | |||
| float *const A_BL_B = A + *n; | |||
| float *const A_BR_B = A + *ldA * n1 + *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| float *const Work_BL = Work + n1; | |||
| float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; | |||
| // last row of A_BL | |||
| float *const A_BL_b = A_BL + n2_out; | |||
| // last row of Work_BL | |||
| float *const Work_BL_b = Work_BL + n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = SREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| float *const Work_R = top ? Work : Work + *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| float *const A_TL_T = A + *ldA * n_rest; | |||
| float *const A_TR_T = A + *ldA * (n_rest + n1); | |||
| float *const A_TL = A + *ldA * n_rest + n_rest; | |||
| float *const A_TR = A + *ldA * (n_rest + n1) + n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| float *const Work_L = Work; | |||
| float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,351 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static int c__1 = 1; | |||
| static float c_b8 = -1.f; | |||
| static float c_b9 = 1.f; | |||
| /** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. | |||
| * | |||
| * This routine is a minor modification of LAPACK's slasyf. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int * | |||
| nb, int *kb, float *a, int *lda, int *ipiv, float *w, | |||
| int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; | |||
| float r__1, r__2, r__3; | |||
| /* Builtin functions */ | |||
| double sqrt(double); | |||
| /* Local variables */ | |||
| static int j, k; | |||
| static float t, r1, d11, d21, d22; | |||
| static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
| static float alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int sscal_(int *, float *, float *, int *), | |||
| sgemv_(char *, int *, int *, float *, float *, int *, | |||
| float *, int *, float *, float *, int *, ftnlen); | |||
| static int kstep; | |||
| extern /* Subroutine */ int scopy_(int *, float *, int *, float *, | |||
| int *), sswap_(int *, float *, int *, float *, int * | |||
| ); | |||
| static float absakk; | |||
| extern int isamax_(int *, float *, int *); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| kstep = 1; | |||
| absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| i__1 = k - imax; | |||
| jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
| &c__1); | |||
| rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| /* Computing MAX */ | |||
| r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1], | |||
| dabs(r__1)); | |||
| rowmax = dmax(r__2,r__3); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >= | |||
| alpha * rowmax) { | |||
| kp = imax; | |||
| scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kp != kk) { | |||
| a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; | |||
| i__1 = kk - 1 - kp; | |||
| scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| r1 = 1.f / a[k + k * a_dim1]; | |||
| i__1 = k - 1; | |||
| sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (k > 2) { | |||
| d21 = w[k - 1 + kw * w_dim1]; | |||
| d11 = w[k + kw * w_dim1] / d21; | |||
| d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; | |||
| t = 1.f / (d11 * d22 - 1.f); | |||
| d21 = t / d21; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) | |||
| * w_dim1] - w[j + kw * w_dim1]); | |||
| a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - | |||
| w[j + (kw - 1) * w_dim1]); | |||
| /* L20: */ | |||
| } | |||
| } | |||
| a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; | |||
| a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; | |||
| a[k + k * a_dim1] = w[k + kw * w_dim1]; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| ++j; | |||
| } | |||
| ++j; | |||
| if (jp != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| i__1 = *n - k + 1; | |||
| scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k | |||
| + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
| kstep = 1; | |||
| absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| i__1 = imax - k; | |||
| scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], | |||
| lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * | |||
| w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
| ; | |||
| rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| /* Computing MAX */ | |||
| r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1], | |||
| dabs(r__1)); | |||
| rowmax = dmax(r__2,r__3); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >= | |||
| alpha * rowmax) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kp != kk) { | |||
| a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; | |||
| i__1 = kp - kk - 1; | |||
| scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| r1 = 1.f / a[k + k * a_dim1]; | |||
| i__1 = *n - k; | |||
| sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| d21 = w[k + 1 + k * w_dim1]; | |||
| d11 = w[k + 1 + (k + 1) * w_dim1] / d21; | |||
| d22 = w[k + k * w_dim1] / d21; | |||
| t = 1.f / (d11 * d22 - 1.f); | |||
| d21 = t / d21; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - | |||
| w[j + (k + 1) * w_dim1]); | |||
| a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * | |||
| w_dim1] - w[j + k * w_dim1]); | |||
| /* L80: */ | |||
| } | |||
| } | |||
| a[k + k * a_dim1] = w[k + k * w_dim1]; | |||
| a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; | |||
| a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| --j; | |||
| } | |||
| --j; | |||
| if (jp != jj && j >= 1) { | |||
| sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ssytrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_ssytrf_rook( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("SSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** ssytrf_rook's recursive compute kernel */ | |||
| static void RELAPACK_ssytrf_rook_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = SREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| float *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| float *const A_BL = A + n1; | |||
| float *const A_BR = A + *ldA * n1 + n1; | |||
| float *const A_BL_B = A + *n; | |||
| float *const A_BR_B = A + *ldA * n1 + *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| float *const Work_BL = Work + n1; | |||
| float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; | |||
| // last row of A_BL | |||
| float *const A_BL_b = A_BL + n2_out; | |||
| // last row of Work_BL | |||
| float *const Work_BL_b = Work_BL + n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = SREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| float *const Work_R = top ? Work : Work + *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| float *const A_TL_T = A + *ldA * n_rest; | |||
| float *const A_TR_T = A + *ldA * (n_rest + n1); | |||
| float *const A_TL = A + *ldA * n_rest + n_rest; | |||
| float *const A_TR = A + *ldA * (n_rest + n1) + n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| float *const Work_L = Work; | |||
| float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,451 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static int c__1 = 1; | |||
| static float c_b9 = -1.f; | |||
| static float c_b10 = 1.f; | |||
| /** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is a minor modification of LAPACK's slasyf_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, float *a, int *lda, int *ipiv, float * | |||
| w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; | |||
| float r__1; | |||
| /* Builtin functions */ | |||
| double sqrt(double); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static float t, r1, d11, d12, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static float alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int sscal_(int *, float *, float *, int *); | |||
| static float sfmin; | |||
| static int itemp; | |||
| extern /* Subroutine */ int sgemv_(char *, int *, int *, float *, | |||
| float *, int *, float *, int *, float *, float *, int *, | |||
| ftnlen); | |||
| static int kstep; | |||
| static float stemp; | |||
| extern /* Subroutine */ int scopy_(int *, float *, int *, float *, | |||
| int *), sswap_(int *, float *, int *, float *, int * | |||
| ); | |||
| static float absakk; | |||
| extern double slamch_(char *, ftnlen); | |||
| extern int isamax_(int *, float *, int *); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| sfmin = slamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) < | |||
| alpha * rowmax)) { | |||
| kp = imax; | |||
| scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = k - p; | |||
| scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - k + 1; | |||
| sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| a[kp + k * a_dim1] = a[kk + k * a_dim1]; | |||
| i__1 = k - 1 - kp; | |||
| scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - kk + 1; | |||
| sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { | |||
| r1 = 1.f / a[k + k * a_dim1]; | |||
| i__1 = k - 1; | |||
| sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else if (a[k + k * a_dim1] != 0.f) { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| a[ii + k * a_dim1] /= a[k + k * a_dim1]; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| d12 = w[k - 1 + kw * w_dim1]; | |||
| d11 = w[k + kw * w_dim1] / d12; | |||
| d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; | |||
| t = 1.f / (d11 * d22 - 1.f); | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * | |||
| w_dim1] - w[j + kw * w_dim1]) / d12); | |||
| a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - | |||
| w[j + (kw - 1) * w_dim1]) / d12); | |||
| /* L20: */ | |||
| } | |||
| } | |||
| a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; | |||
| a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; | |||
| a[k + k * a_dim1] = w[k + kw * w_dim1]; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| jj = j - 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| i__1 = *n - j + 1; | |||
| sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j <= *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = *n - k + 1; | |||
| scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = *n - k + 1; | |||
| scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k | |||
| + 1) * w_dim1], &c__1, (ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) < | |||
| alpha * rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p - k; | |||
| scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - p + 1; | |||
| scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
| c__1); | |||
| sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| a[kp + k * a_dim1] = a[kk + k * a_dim1]; | |||
| i__1 = kp - k - 1; | |||
| scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
| * a_dim1], lda); | |||
| i__1 = *n - kp + 1; | |||
| scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
| a_dim1], &c__1); | |||
| sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { | |||
| r1 = 1.f / a[k + k * a_dim1]; | |||
| i__1 = *n - k; | |||
| sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else if (a[k + k * a_dim1] != 0.f) { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| a[ii + k * a_dim1] /= a[k + k * a_dim1]; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| d21 = w[k + 1 + k * w_dim1]; | |||
| d11 = w[k + 1 + (k + 1) * w_dim1] / d21; | |||
| d22 = w[k + k * w_dim1] / d21; | |||
| t = 1.f / (d11 * d22 - 1.f); | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[ | |||
| j + (k + 1) * w_dim1]) / d21); | |||
| a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) * | |||
| w_dim1] - w[j + k * w_dim1]) / d21); | |||
| /* L80: */ | |||
| } | |||
| } | |||
| a[k + k * a_dim1] = w[k + k * w_dim1]; | |||
| a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; | |||
| a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| jj = j + 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j >= 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,274 @@ | |||
| #include "relapack.h" | |||
| #include <math.h> | |||
| static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, | |||
| const int *, const float *, const int *, const float *, const int *, | |||
| float *, const int *, const float *, const int *, const float *, | |||
| const int *, float *, const int *, float *, float *, float *, int *, int *, | |||
| int *); | |||
| /** STGSYL solves the generalized Sylvester equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's stgsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html | |||
| * */ | |||
| void RELAPACK_stgsyl( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dif, | |||
| float *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| // Parse arguments | |||
| const int notran = LAPACK(lsame)(trans, "N"); | |||
| const int tran = LAPACK(lsame)(trans, "T"); | |||
| // Compute work buffer size | |||
| int lwmin = 1; | |||
| if (notran && (*ijob == 1 || *ijob == 2)) | |||
| lwmin = MAX(1, 2 * *m * *n); | |||
| *info = 0; | |||
| // Check arguments | |||
| if (!tran && !notran) | |||
| *info = -1; | |||
| else if (notran && (*ijob < 0 || *ijob > 4)) | |||
| *info = -2; | |||
| else if (*m <= 0) | |||
| *info = -3; | |||
| else if (*n <= 0) | |||
| *info = -4; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -6; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -8; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -10; | |||
| else if (*ldD < MAX(1, *m)) | |||
| *info = -12; | |||
| else if (*ldE < MAX(1, *n)) | |||
| *info = -14; | |||
| else if (*ldF < MAX(1, *m)) | |||
| *info = -16; | |||
| else if (*lWork < lwmin && *lWork != -1) | |||
| *info = -20; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("STGSYL", &minfo); | |||
| return; | |||
| } | |||
| if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = lwmin; | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantrans = notran ? 'N' : 'T'; | |||
| // Constant | |||
| const float ZERO[] = { 0. }; | |||
| int isolve = 1; | |||
| int ifunc = 0; | |||
| if (notran) { | |||
| if (*ijob >= 3) { | |||
| ifunc = *ijob - 2; | |||
| LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
| LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
| } else if (*ijob >= 1) | |||
| isolve = 2; | |||
| } | |||
| float scale2; | |||
| int iround; | |||
| for (iround = 1; iround <= isolve; iround++) { | |||
| *scale = 1; | |||
| float dscale = 0; | |||
| float dsum = 1; | |||
| int pq; | |||
| RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); | |||
| if (dscale != 0) { | |||
| if (*ijob == 1 || *ijob == 3) | |||
| *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); | |||
| else | |||
| *dif = sqrt(pq) / (dscale * sqrt(dsum)); | |||
| } | |||
| if (isolve == 2) { | |||
| if (iround == 1) { | |||
| if (notran) | |||
| ifunc = *ijob; | |||
| scale2 = *scale; | |||
| LAPACK(slacpy)("F", m, n, C, ldC, Work, m); | |||
| LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m); | |||
| LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
| LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
| } else { | |||
| LAPACK(slacpy)("F", m, n, Work, m, C, ldC); | |||
| LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF); | |||
| *scale = scale2; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /** stgsyl's recursive vompute kernel */ | |||
| static void RELAPACK_stgsyl_rec( | |||
| const char *trans, const int *ifunc, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dsum, float *dscale, | |||
| int *iWork, int *pq, int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) { | |||
| // Unblocked | |||
| LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| float scale1[] = { 1. }; | |||
| float scale2[] = { 1. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| int m1 = SREC_SPLIT(*m); | |||
| if (A[m1 + *ldA * (m1 - 1)]) | |||
| m1++; | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const float *const A_TL = A; | |||
| const float *const A_TR = A + *ldA * m1; | |||
| const float *const A_BR = A + *ldA * m1 + m1; | |||
| // C_T | |||
| // C_B | |||
| float *const C_T = C; | |||
| float *const C_B = C + m1; | |||
| // D_TL D_TR | |||
| // 0 D_BR | |||
| const float *const D_TL = D; | |||
| const float *const D_TR = D + *ldD * m1; | |||
| const float *const D_BR = D + *ldD * m1 + m1; | |||
| // F_T | |||
| // F_B | |||
| float *const F_T = F; | |||
| float *const F_B = F + m1; | |||
| if (*trans == 'N') { | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // F_T = F_T - D_TR * C_B | |||
| BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
| // C_B = C_B - A_TR^H * C_T | |||
| BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // C_B = C_B - D_TR^H * F_T | |||
| BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
| } | |||
| } | |||
| } else { | |||
| // Splitting | |||
| int n1 = SREC_SPLIT(*n); | |||
| if (B[n1 + *ldB * (n1 - 1)]) | |||
| n1++; | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + *ldB * n1; | |||
| const float *const B_BR = B + *ldB * n1 + n1; | |||
| // C_L C_R | |||
| float *const C_L = C; | |||
| float *const C_R = C + *ldC * n1; | |||
| // E_TL E_TR | |||
| // 0 E_BR | |||
| const float *const E_TL = E; | |||
| const float *const E_TR = E + *ldE * n1; | |||
| const float *const E_BR = E + *ldE * n1 + n1; | |||
| // F_L F_R | |||
| float *const F_L = F; | |||
| float *const F_R = F + *ldF * n1; | |||
| if (*trans == 'N') { | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
| // C_R = C_R + F_L * B_TR | |||
| BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
| // F_R = F_R + F_L * E_TR | |||
| BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
| // F_L = F_L + C_R * B_TR | |||
| BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
| // F_L = F_L + F_R * E_TR | |||
| BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
| } | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,169 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_strsyl_rec(const char *, const char *, const int *, | |||
| const int *, const int *, const float *, const int *, const float *, | |||
| const int *, float *, const int *, float *, int *); | |||
| /** STRSYL solves the real Sylvester matrix equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's strsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html | |||
| * */ | |||
| void RELAPACK_strsyl( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int notransA = LAPACK(lsame)(tranA, "N"); | |||
| const int transA = LAPACK(lsame)(tranA, "T"); | |||
| const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
| const int notransB = LAPACK(lsame)(tranB, "N"); | |||
| const int transB = LAPACK(lsame)(tranB, "T"); | |||
| const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
| *info = 0; | |||
| if (!transA && !ctransA && !notransA) | |||
| *info = -1; | |||
| else if (!transB && !ctransB && !notransB) | |||
| *info = -2; | |||
| else if (*isgn != 1 && *isgn != -1) | |||
| *info = -3; | |||
| else if (*m < 0) | |||
| *info = -4; | |||
| else if (*n < 0) | |||
| *info = -5; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -7; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -9; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -11; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("STRSYL", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); | |||
| const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C'); | |||
| // Recursive kernel | |||
| RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| /** strsyl's recursive compute kernel */ | |||
| static void RELAPACK_strsyl_rec( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) { | |||
| // Unblocked | |||
| RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| const float MSGN[] = { -*isgn }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| float scale1[] = { 1. }; | |||
| float scale2[] = { 1. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| int m1 = SREC_SPLIT(*m); | |||
| if (A[m1 + *ldA * (m1 - 1)]) | |||
| m1++; | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const float *const A_TL = A; | |||
| const float *const A_TR = A + *ldA * m1; | |||
| const float *const A_BR = A + *ldA * m1 + m1; | |||
| // C_T | |||
| // C_B | |||
| float *const C_T = C; | |||
| float *const C_B = C + m1; | |||
| if (*tranA == 'N') { | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| } else { | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
| // C_B = C_B - A_TR' * C_T | |||
| BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
| } | |||
| } else { | |||
| // Splitting | |||
| int n1 = SREC_SPLIT(*n); | |||
| if (B[n1 + *ldB * (n1 - 1)]) | |||
| n1++; | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + *ldB * n1; | |||
| const float *const B_BR = B + *ldB * n1 + n1; | |||
| // C_L C_R | |||
| float *const C_L = C; | |||
| float *const C_R = C + *ldC * n1; | |||
| if (*tranB == 'N') { | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
| // C_R = C_R -/+ C_L * B_TR | |||
| BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| } else { | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
| // C_L = C_L -/+ C_R * B_TR' | |||
| BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,107 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_strtri_rec(const char *, const char *, const int *, | |||
| float *, const int *, int *); | |||
| /** CTRTRI computes the inverse of a real upper or lower triangular matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's strtri. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html | |||
| * */ | |||
| void RELAPACK_strtri( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int nounit = LAPACK(lsame)(diag, "N"); | |||
| const int unit = LAPACK(lsame)(diag, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (!nounit && !unit) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("STRTRI", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleandiag = nounit ? 'N' : 'U'; | |||
| // check for singularity | |||
| if (nounit) { | |||
| int i; | |||
| for (i = 0; i < *n; i++) | |||
| if (A[i + *ldA * i] == 0) { | |||
| *info = i; | |||
| return; | |||
| } | |||
| } | |||
| // Recursive kernel | |||
| RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
| } | |||
| /** strtri's recursive compute kernel */ | |||
| static void RELAPACK_strtri_rec( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_STRTRI, 1)) { | |||
| // Unblocked | |||
| LAPACK(strti2)(uplo, diag, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1. }; | |||
| const float MONE[] = { -1. }; | |||
| // Splitting | |||
| const int n1 = SREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + *ldA * n1; | |||
| float *const A_BL = A + n1; | |||
| float *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = - A_BL * A_TL | |||
| BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BL = A_BR \ A_BL | |||
| BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TR = - A_TL * A_TR | |||
| BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
| // A_TR = A_TR / A_BR | |||
| BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,230 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, | |||
| const int *, double *, const int *, int *, double *, const int *, double *, | |||
| const int *, int *); | |||
| /** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zgbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_zgbtrf( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| double *Ab, const int *ldAb, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kl < 0) | |||
| *info = -3; | |||
| else if (*ku < 0) | |||
| *info = -4; | |||
| else if (*ldAb < 2 * *kl + *ku + 1) | |||
| *info = -6; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZGBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Constant | |||
| const double ZERO[] = { 0., 0. }; | |||
| // Result upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| double *const A = Ab + 2 * kv; | |||
| // Zero upper diagonal fill-in elements | |||
| int i, j; | |||
| for (j = 0; j < *n; j++) { | |||
| double *const A_j = A + 2 * *ldA * j; | |||
| for (i = MAX(0, j - kv); i < j - *ku; i++) | |||
| A_j[2 * i] = A_j[2 * i + 1] = 0.; | |||
| } | |||
| // Allocate work space | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
| const int nWorkl = (kv > n1) ? n1 : kv; | |||
| const int mWorku = (*kl > n1) ? n1 : *kl; | |||
| const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
| double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); | |||
| double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); | |||
| LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
| LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
| // Recursive kernel | |||
| RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
| // Free work space | |||
| free(Workl); | |||
| free(Worku); | |||
| } | |||
| /** zgbtrf's recursive compute kernel */ | |||
| static void RELAPACK_zgbtrf_rec( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| double *Ab, const int *ldAb, int *ipiv, | |||
| double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterators | |||
| int i, j; | |||
| // Output upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| double *const A = Ab + 2 * kv; | |||
| // Splitting | |||
| const int n1 = MIN(ZREC_SPLIT(*n), *kl); | |||
| const int n2 = *n - n1; | |||
| const int m1 = MIN(n1, *m); | |||
| const int m2 = *m - m1; | |||
| const int mn1 = MIN(m1, n1); | |||
| const int mn2 = MIN(m2, n2); | |||
| // Ab_L * | |||
| // Ab_BR | |||
| double *const Ab_L = Ab; | |||
| double *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
| // A_L A_R | |||
| double *const A_L = A; | |||
| double *const A_R = A + 2 * *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + 2 * *ldA * n1; | |||
| double *const A_BL = A + 2 * m1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * m1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, kv - n1); | |||
| const int n22 = MIN(n2 - n21, n1); | |||
| const int m21 = MIN(m2, *kl - m1); | |||
| const int m22 = MIN(m2 - m21, m1); | |||
| // n1 n21 n22 | |||
| // m * A_Rl ARr | |||
| double *const A_Rl = A_R; | |||
| double *const A_Rr = A_R + 2 * *ldA * n21; | |||
| // n1 n21 n22 | |||
| // m1 * A_TRl A_TRr | |||
| // m21 A_BLt A_BRtl A_BRtr | |||
| // m22 A_BLb A_BRbl A_BRbr | |||
| double *const A_TRl = A_TR; | |||
| double *const A_TRr = A_TR + 2 * *ldA * n21; | |||
| double *const A_BLt = A_BL; | |||
| double *const A_BLb = A_BL + 2 * m21; | |||
| double *const A_BRtl = A_BR; | |||
| double *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
| double *const A_BRbl = A_BR + 2 * m21; | |||
| double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; | |||
| // recursion(Ab_L, ipiv_T) | |||
| RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
| // Workl = A_BLb | |||
| LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
| // partially redo swaps in A_L | |||
| for (i = 0; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
| else | |||
| BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
| } | |||
| } | |||
| // apply pivots to A_Rl | |||
| LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
| // apply pivots to A_Rr columnwise | |||
| for (j = 0; j < n22; j++) { | |||
| double *const A_Rrj = A_Rr + 2 * *ldA * j; | |||
| for (i = j; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| const double tmpr = A_Rrj[2 * i]; | |||
| const double tmpc = A_Rrj[2 * i + 1]; | |||
| A_Rrj[2 * i] = A_Rrj[2 * ip]; | |||
| A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1]; | |||
| A_Rrj[2 * ip] = tmpr; | |||
| A_Rrj[2 * ip + 1] = tmpc; | |||
| } | |||
| } | |||
| } | |||
| // A_TRl = A_TL \ A_TRl | |||
| BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // Worku = A_TRr | |||
| LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
| // Worku = A_TL \ Worku | |||
| BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
| // A_TRr = Worku | |||
| LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_TRl | |||
| BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // A_BRbl = A_BRbl - Workl * A_TRl | |||
| BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRtr = A_BRtr - A_BLt * Worku | |||
| BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Workl * Worku | |||
| BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
| // partially undo swaps in A_L | |||
| for (i = mn1 - 1; i >= 0; i--) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
| else | |||
| BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
| } | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| for (i = 0; i < mn2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,167 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, | |||
| const int *, const int *, const double *, const double *, const int *, | |||
| const double *, const int *, const double *, double *, const int *); | |||
| static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, | |||
| const int *, const int *, const double *, const double *, const int *, | |||
| const double *, const int *, const double *, double *, const int *); | |||
| /** ZGEMMT computes a matrix-matrix product with general matrices but updates | |||
| * only the upper or lower triangular part of the result matrix. | |||
| * | |||
| * This routine performs the same operation as the BLAS routine | |||
| * zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
| * but only updates the triangular part of C specified by uplo: | |||
| * If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
| * otherwise the upper triangular part is updated. | |||
| * */ | |||
| void RELAPACK_zgemmt( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| #if HAVE_XGEMMT | |||
| BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| #else | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int notransA = LAPACK(lsame)(transA, "N"); | |||
| const int tranA = LAPACK(lsame)(transA, "T"); | |||
| const int ctransA = LAPACK(lsame)(transA, "C"); | |||
| const int notransB = LAPACK(lsame)(transB, "N"); | |||
| const int tranB = LAPACK(lsame)(transB, "T"); | |||
| const int ctransB = LAPACK(lsame)(transB, "C"); | |||
| int info = 0; | |||
| if (!lower && !upper) | |||
| info = 1; | |||
| else if (!tranA && !ctransA && !notransA) | |||
| info = 2; | |||
| else if (!tranB && !ctransB && !notransB) | |||
| info = 3; | |||
| else if (*n < 0) | |||
| info = 4; | |||
| else if (*k < 0) | |||
| info = 5; | |||
| else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
| info = 8; | |||
| else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
| info = 10; | |||
| else if (*ldC < MAX(1, *n)) | |||
| info = 13; | |||
| if (info) { | |||
| LAPACK(xerbla)("ZGEMMT", &info); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); | |||
| const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); | |||
| // Recursive kernel | |||
| RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| #endif | |||
| } | |||
| /** zgemmt's recursive compute kernel */ | |||
| static void RELAPACK_zgemmt_rec( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { | |||
| // Unblocked | |||
| RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| } | |||
| // Splitting | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_T | |||
| // A_B | |||
| const double *const A_T = A; | |||
| const double *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); | |||
| // B_L B_R | |||
| const double *const B_L = B; | |||
| const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); | |||
| // C_TL C_TR | |||
| // C_BL C_BR | |||
| double *const C_TL = C; | |||
| double *const C_TR = C + 2 * *ldC * n1; | |||
| double *const C_BL = C + 2 * n1; | |||
| double *const C_BR = C + 2 * *ldC * n1 + 2 * n1; | |||
| // recursion(C_TL) | |||
| RELAPACK_zgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
| if (*uplo == 'L') | |||
| // C_BL = alpha A_B B_L + beta C_BL | |||
| BLAS(zgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
| else | |||
| // C_TR = alpha A_T B_R + beta C_TR | |||
| BLAS(zgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
| // recursion(C_BR) | |||
| RELAPACK_zgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
| } | |||
| /** zgemmt's unblocked compute kernel */ | |||
| static void RELAPACK_zgemmt_rec2( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| const int incB = (*transB == 'N') ? 1 : *ldB; | |||
| const int incC = 1; | |||
| int i; | |||
| for (i = 0; i < *n; i++) { | |||
| // A_0 | |||
| // A_i | |||
| const double *const A_0 = A; | |||
| const double *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i); | |||
| // * B_i * | |||
| const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i); | |||
| // * C_0i * | |||
| // * C_ii * | |||
| double *const C_0i = C + 2 * *ldC * i; | |||
| double *const C_ii = C + 2 * *ldC * i + 2 * i; | |||
| if (*uplo == 'L') { | |||
| const int nmi = *n - i; | |||
| if (*transA == 'N') | |||
| BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| else | |||
| BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| } else { | |||
| const int ip1 = i + 1; | |||
| if (*transA == 'N') | |||
| BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| else | |||
| BLAS(zgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_zgetrf_rec(const int *, const int *, double *, | |||
| const int *, int *, int *); | |||
| /** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zgetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html | |||
| * */ | |||
| void RELAPACK_zgetrf( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZGETRF", &minfo); | |||
| return; | |||
| } | |||
| const int sn = MIN(*m, *n); | |||
| RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| if (*m < *n) { | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int rn = *n - *m; | |||
| // A_L A_R | |||
| const double *const A_L = A; | |||
| double *const A_R = A + 2 * *ldA * *m; | |||
| // A_R = apply(ipiv, A_R) | |||
| LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
| // A_R = A_L \ A_R | |||
| BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
| } | |||
| } | |||
| /** zgetrf's recursive compute kernel */ | |||
| static void RELAPACK_zgetrf_rec( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_ZGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(zgetf2)(m, n, A, ldA, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1. }; | |||
| // Splitting | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| const int m2 = *m - n1; | |||
| // A_L A_R | |||
| double *const A_L = A; | |||
| double *const A_R = A + 2 * *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + 2 * *ldA * n1; | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // recursion(A_L, ipiv_T) | |||
| RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
| // apply pivots to A_R | |||
| LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
| // A_TR = A_TL \ A_TR | |||
| BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_BL * A_TR | |||
| BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
| // recursion(A_BR, ipiv_B) | |||
| RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // apply pivots to A_BL | |||
| LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,212 @@ | |||
| #include "relapack.h" | |||
| #if XSYGST_ALLOW_MALLOC | |||
| #include "stdlib.h" | |||
| #endif | |||
| static void RELAPACK_zhegst_rec(const int *, const char *, const int *, | |||
| double *, const int *, const double *, const int *, | |||
| double *, const int *, int *); | |||
| /** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zhegst. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html | |||
| * */ | |||
| void RELAPACK_zhegst( | |||
| const int *itype, const char *uplo, const int *n, | |||
| double *A, const int *ldA, const double *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (*itype < 1 || *itype > 3) | |||
| *info = -1; | |||
| else if (!lower && !upper) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -7; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZHEGST", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Allocate work space | |||
| double *Work = NULL; | |||
| int lWork = 0; | |||
| #if XSYGST_ALLOW_MALLOC | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| lWork = n1 * (*n - n1); | |||
| Work = malloc(lWork * 2 * sizeof(double)); | |||
| if (!Work) | |||
| lWork = 0; | |||
| #endif | |||
| // recursive kernel | |||
| RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
| // Free work space | |||
| #if XSYGST_ALLOW_MALLOC | |||
| if (Work) | |||
| free(Work); | |||
| #endif | |||
| } | |||
| /** zhegst's recursive compute kernel */ | |||
| static void RELAPACK_zhegst_rec( | |||
| const int *itype, const char *uplo, const int *n, | |||
| double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { | |||
| // Unblocked | |||
| LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ZERO[] = { 0., 0. }; | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const double HALF[] = { .5, 0. }; | |||
| const double MHALF[] = { -.5, 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterator | |||
| int i; | |||
| // Splitting | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + 2 * *ldA * n1; | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // B_TL B_TR | |||
| // B_BL B_BR | |||
| const double *const B_TL = B; | |||
| const double *const B_TR = B + 2 * *ldB * n1; | |||
| const double *const B_BL = B + 2 * n1; | |||
| const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // recursion(A_TL, B_TL) | |||
| RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); | |||
| if (*itype == 1) | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / B_TL' | |||
| BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
| if (*lWork >= n2 * n1) { | |||
| // T = -1/2 * B_BL * A_TL | |||
| BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
| } else | |||
| // A_BL = A_BL - 1/2 B_BL * A_TL | |||
| BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' | |||
| BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); | |||
| if (*lWork >= n2 * n1) | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
| else | |||
| // A_BL = A_BL - 1/2 B_BL * A_TL | |||
| BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BL = B_BR \ A_BL | |||
| BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
| } else { | |||
| // A_TR = B_TL' \ A_TR | |||
| BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
| if (*lWork >= n2 * n1) { | |||
| // T = -1/2 * A_TL * B_TR | |||
| BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
| // A_TR = A_BL + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
| } else | |||
| // A_TR = A_TR - 1/2 A_TL * B_TR | |||
| BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR | |||
| BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); | |||
| if (*lWork >= n2 * n1) | |||
| // A_TR = A_BL + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
| else | |||
| // A_TR = A_TR - 1/2 A_TL * B_TR | |||
| BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TR = A_TR / B_BR | |||
| BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
| } | |||
| else | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL * B_TL | |||
| BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); | |||
| if (*lWork >= n2 * n1) { | |||
| // T = 1/2 * A_BR * B_BL | |||
| BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
| } else | |||
| // A_BL = A_BL + 1/2 A_BR * B_BL | |||
| BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL | |||
| BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); | |||
| if (*lWork >= n2 * n1) | |||
| // A_BL = A_BL + T | |||
| for (i = 0; i < n1; i++) | |||
| BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); | |||
| else | |||
| // A_BL = A_BL + 1/2 A_BR * B_BL | |||
| BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); | |||
| // A_BL = B_BR * A_BL | |||
| BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); | |||
| } else { | |||
| // A_TR = B_TL * A_TR | |||
| BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); | |||
| if (*lWork >= n2 * n1) { | |||
| // T = 1/2 * B_TR * A_BR | |||
| BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); | |||
| // A_TR = A_TR + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
| } else | |||
| // A_TR = A_TR + 1/2 B_TR A_BR | |||
| BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' | |||
| BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); | |||
| if (*lWork >= n2 * n1) | |||
| // A_TR = A_TR + T | |||
| for (i = 0; i < n2; i++) | |||
| BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); | |||
| else | |||
| // A_TR = A_TR + 1/2 B_TR * A_BR | |||
| BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); | |||
| // A_TR = A_TR * B_BR | |||
| BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR, B_BR) | |||
| RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); | |||
| } | |||
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *, | |||
| double *, const int *, int *, double *, const int *, int *); | |||
| /** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zhetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html | |||
| * */ | |||
| void RELAPACK_zhetrf( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| double *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(double)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZHETRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** zhetrf's recursive compute kernel */ | |||
| static void RELAPACK_zhetrf_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_ZHETRF, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = ZREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| double *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| double *const A_BL_B = A + 2 * *n; | |||
| double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| double *const Work_BL = Work + 2 * n1; | |||
| double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
| // last row of A_BL | |||
| double *const A_BL_b = A_BL + 2 * n2_out; | |||
| // last row of Work_BL | |||
| double *const Work_BL_b = Work_BL + 2 * n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = ZREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| double *const A_TL_T = A + 2 * *ldA * n_rest; | |||
| double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
| double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
| double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| double *const Work_L = Work; | |||
| double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,524 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static doublecomplex c_b1 = {1.,0.}; | |||
| static int c__1 = 1; | |||
| /** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method | |||
| * | |||
| * This routine is a minor modification of LAPACK's zlahef. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int * | |||
| nb, int *kb, doublecomplex *a, int *lda, int *ipiv, | |||
| doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| double d__1, d__2, d__3, d__4; | |||
| doublecomplex z__1, z__2, z__3, z__4; | |||
| /* Builtin functions */ | |||
| double sqrt(double), d_imag(doublecomplex *); | |||
| void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *, | |||
| doublecomplex *, doublecomplex *); | |||
| /* Local variables */ | |||
| static int j, k; | |||
| static double t, r1; | |||
| static doublecomplex d11, d21, d22; | |||
| static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
| static double alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| static int kstep; | |||
| extern /* Subroutine */ int zgemv_(char *, int *, int *, | |||
| doublecomplex *, doublecomplex *, int *, doublecomplex *, | |||
| int *, doublecomplex *, doublecomplex *, int *, ftnlen), | |||
| zcopy_(int *, doublecomplex *, int *, doublecomplex *, | |||
| int *), zswap_(int *, doublecomplex *, int *, | |||
| doublecomplex *, int *); | |||
| static double absakk; | |||
| extern /* Subroutine */ int zdscal_(int *, double *, | |||
| doublecomplex *, int *); | |||
| static double colmax; | |||
| extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) | |||
| ; | |||
| extern int izamax_(int *, doublecomplex *, int *); | |||
| static double rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.) + 1.) / 8.; | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| i__1 = k - 1; | |||
| zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (d__1 = w[i__1].r, abs(d__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
| kw * w_dim1]), abs(d__2)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| i__1 = imax - 1; | |||
| zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| i__1 = k - imax; | |||
| zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| i__1 = k - imax; | |||
| zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + (kw - 1) * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| } | |||
| i__1 = k - imax; | |||
| jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
| &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| jmax + (kw - 1) * w_dim1]), abs(d__2)); | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| /* Computing MAX */ | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( | |||
| d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( | |||
| d__2)); | |||
| rowmax = max(d__3,d__4); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { | |||
| kp = imax; | |||
| zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| i__1 = kk - 1 - kp; | |||
| zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kk - 1 - kp; | |||
| zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| r1 = 1. / a[i__1].r; | |||
| i__1 = k - 1; | |||
| zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| i__1 = k - 1; | |||
| zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| d_cnjg(&z__2, &d21); | |||
| z_div(&z__1, &w[k + kw * w_dim1], &z__2); | |||
| d11.r = z__1.r, d11.i = z__1.i; | |||
| z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
| d22.r = z__1.r, d22.i = z__1.i; | |||
| z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1. / (z__1.r - 1.); | |||
| z__2.r = t, z__2.i = 0.; | |||
| z_div(&z__1, &z__2, &d21); | |||
| d21.r = z__1.r, d21.i = z__1.i; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
| .i; | |||
| z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
| d21.r * z__2.i + d21.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| d_cnjg(&z__2, &d21); | |||
| i__3 = j + kw * w_dim1; | |||
| z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = | |||
| z__2.r * z__3.i + z__2.i * z__3.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1; | |||
| zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = k - 2; | |||
| zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| ++j; | |||
| } | |||
| ++j; | |||
| if (jp != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k | |||
| + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (d__1 = w[i__1].r, abs(d__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
| k * w_dim1]), abs(d__2)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| i__1 = imax - k; | |||
| zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = imax - k; | |||
| zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
| imax + 1 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], | |||
| lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * | |||
| w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + (k + 1) * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
| ; | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| jmax + (k + 1) * w_dim1]), abs(d__2)); | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| /* Computing MAX */ | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( | |||
| d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( | |||
| d__2)); | |||
| rowmax = max(d__3,d__4); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + | |||
| k * w_dim1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| i__1 = kp - kk - 1; | |||
| zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kp - kk - 1; | |||
| zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| r1 = 1. / a[i__1].r; | |||
| i__1 = *n - k; | |||
| zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| i__1 = *n - k; | |||
| zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = z__1.r, d11.i = z__1.i; | |||
| d_cnjg(&z__2, &d21); | |||
| z_div(&z__1, &w[k + k * w_dim1], &z__2); | |||
| d22.r = z__1.r, d22.i = z__1.i; | |||
| z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1. / (z__1.r - 1.); | |||
| z__2.r = t, z__2.i = 0.; | |||
| z_div(&z__1, &z__2, &d21); | |||
| d21.r = z__1.r, d21.i = z__1.i; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| d_cnjg(&z__2, &d21); | |||
| i__3 = j + k * w_dim1; | |||
| z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = | |||
| z__2.r * z__3.i + z__2.i * z__3.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
| .i; | |||
| z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
| d21.r * z__2.i + d21.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = *n - k; | |||
| zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = *n - k - 1; | |||
| zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| --j; | |||
| } | |||
| --j; | |||
| if (jp != jj && j >= 1) { | |||
| zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *, | |||
| double *, const int *, int *, double *, const int *, int *); | |||
| /** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zhetrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_zhetrf_rook( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| double *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(double)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZHETRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** zhetrf_rook's recursive compute kernel */ | |||
| static void RELAPACK_zhetrf_rook_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = ZREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| double *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| double *const A_BL_B = A + 2 * *n; | |||
| double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| double *const Work_BL = Work + 2 * n1; | |||
| double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
| // last row of A_BL | |||
| double *const A_BL_b = A_BL + 2 * n2_out; | |||
| // last row of Work_BL | |||
| double *const Work_BL_b = Work_BL + 2 * n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = ZREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| double *const A_TL_T = A + 2 * *ldA * n_rest; | |||
| double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
| double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
| double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| double *const Work_L = Work; | |||
| double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,662 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static doublecomplex c_b1 = {1.,0.}; | |||
| static int c__1 = 1; | |||
| /** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method | |||
| * | |||
| * This routine is a minor modification of LAPACK's zlahef_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, doublecomplex *a, int *lda, int * | |||
| ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| double d__1, d__2; | |||
| doublecomplex z__1, z__2, z__3, z__4, z__5; | |||
| /* Builtin functions */ | |||
| double sqrt(double), d_imag(doublecomplex *); | |||
| void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *, | |||
| doublecomplex *, doublecomplex *); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static double t, r1; | |||
| static doublecomplex d11, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static double alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| static double dtemp, sfmin; | |||
| static int itemp, kstep; | |||
| extern /* Subroutine */ int zgemv_(char *, int *, int *, | |||
| doublecomplex *, doublecomplex *, int *, doublecomplex *, | |||
| int *, doublecomplex *, doublecomplex *, int *, ftnlen), | |||
| zcopy_(int *, doublecomplex *, int *, doublecomplex *, | |||
| int *), zswap_(int *, doublecomplex *, int *, | |||
| doublecomplex *, int *); | |||
| extern double dlamch_(char *, ftnlen); | |||
| static double absakk; | |||
| extern /* Subroutine */ int zdscal_(int *, double *, | |||
| doublecomplex *, int *); | |||
| static double colmax; | |||
| extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) | |||
| ; | |||
| extern int izamax_(int *, doublecomplex *, int *); | |||
| static double rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.) + 1.) / 8.; | |||
| sfmin = dlamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & | |||
| c__1); | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (d__1 = w[i__1].r, abs(d__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
| kw * w_dim1]), abs(d__2)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], | |||
| &c__1); | |||
| } | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| i__1 = k - imax; | |||
| zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| i__1 = k - imax; | |||
| zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + (kw - 1) * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& | |||
| w[jmax + (kw - 1) * w_dim1]), abs(d__2)); | |||
| } else { | |||
| rowmax = 0.; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| i__1 = itemp + (kw - 1) * w_dim1; | |||
| dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| itemp + (kw - 1) * w_dim1]), abs(d__2)); | |||
| if (dtemp > rowmax) { | |||
| rowmax = dtemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p + p * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| i__1 = k - 1 - p; | |||
| zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| i__1 = k - 1 - p; | |||
| zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); | |||
| if (p > 1) { | |||
| i__1 = p - 1; | |||
| zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + | |||
| 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + | |||
| 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| i__1 = kk - 1 - kp; | |||
| zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kk - 1 - kp; | |||
| zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| t = a[i__1].r; | |||
| if (abs(t) >= sfmin) { | |||
| r1 = 1. / t; | |||
| i__1 = k - 1; | |||
| zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| i__3 = ii + k * a_dim1; | |||
| z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| i__1 = k - 1; | |||
| zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| d_cnjg(&z__2, &d21); | |||
| z_div(&z__1, &w[k + kw * w_dim1], &z__2); | |||
| d11.r = z__1.r, d11.i = z__1.i; | |||
| z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
| d22.r = z__1.r, d22.i = z__1.i; | |||
| z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1. / (z__1.r - 1.); | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| z_div(&z__2, &z__3, &d21); | |||
| z__1.r = t * z__2.r, z__1.i = t * z__2.i; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| d_cnjg(&z__5, &d21); | |||
| z_div(&z__2, &z__3, &z__5); | |||
| z__1.r = t * z__2.r, z__1.i = t * z__2.i; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1; | |||
| zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = k - 2; | |||
| zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| ++jj; | |||
| if (kstep == 2 && jp1 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| } | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (d__1 = w[i__1].r, abs(d__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
| k * w_dim1]), abs(d__2)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * | |||
| a_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = imax - k; | |||
| zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
| imax + 1 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
| 1) * w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + (k + 1) * w_dim1; | |||
| d__1 = w[i__2].r; | |||
| w[i__1].r = d__1, w[i__1].i = 0.; | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& | |||
| w[jmax + (k + 1) * w_dim1]), abs(d__2)); | |||
| } else { | |||
| rowmax = 0.; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = itemp + (k + 1) * w_dim1; | |||
| dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| itemp + (k + 1) * w_dim1]), abs(d__2)); | |||
| if (dtemp > rowmax) { | |||
| rowmax = dtemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p + p * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| i__1 = p - k - 1; | |||
| zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * | |||
| a_dim1], lda); | |||
| i__1 = p - k - 1; | |||
| zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); | |||
| if (p < *n) { | |||
| i__1 = *n - p; | |||
| zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p | |||
| * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| } | |||
| zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| d__1 = a[i__2].r; | |||
| a[i__1].r = d__1, a[i__1].i = 0.; | |||
| i__1 = kp - kk - 1; | |||
| zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kp - kk - 1; | |||
| zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| t = a[i__1].r; | |||
| if (abs(t) >= sfmin) { | |||
| r1 = 1. / t; | |||
| i__1 = *n - k; | |||
| zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| i__3 = ii + k * a_dim1; | |||
| z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| i__1 = *n - k; | |||
| zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = z__1.r, d11.i = z__1.i; | |||
| d_cnjg(&z__2, &d21); | |||
| z_div(&z__1, &w[k + k * w_dim1], &z__2); | |||
| d22.r = z__1.r, d22.i = z__1.i; | |||
| z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1. / (z__1.r - 1.); | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| d_cnjg(&z__5, &d21); | |||
| z_div(&z__2, &z__3, &z__5); | |||
| z__1.r = t * z__2.r, z__1.i = t * z__2.i; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| z_div(&z__2, &z__3, &d21); | |||
| z__1.r = t * z__2.r, z__1.i = t * z__2.i; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = *n - k; | |||
| zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = *n - k - 1; | |||
| zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| --jj; | |||
| if (kstep == 2 && jp1 != jj && j >= 1) { | |||
| zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_zlauum_rec(const char *, const int *, double *, | |||
| const int *, int *); | |||
| /** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zlauum. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html | |||
| * */ | |||
| void RELAPACK_zlauum( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZLAUUM", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_zlauum_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** zlauum's recursive compute kernel */ | |||
| static void RELAPACK_zlauum_rec( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { | |||
| // Unblocked | |||
| LAPACK(zlauu2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| // Splitting | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + 2 * *ldA * n1; | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*uplo == 'L') { | |||
| // A_TL = A_TL + A_BL' * A_BL | |||
| BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
| // A_BL = A_BR' * A_BL | |||
| BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TL = A_TL + A_TR * A_TR' | |||
| BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
| // A_TR = A_TR * A_BR' | |||
| BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info); | |||
| } | |||
| @@ -0,0 +1,157 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, | |||
| double *, const int *, double *, const int *, int *); | |||
| /** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zpbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_zpbtrf( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kd < 0) | |||
| *info = -3; | |||
| else if (*ldAb < *kd + 1) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZPBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Constant | |||
| const double ZERO[] = { 0., 0. }; | |||
| // Allocate work space | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
| const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
| double *Work = malloc(mWork * nWork * 2 * sizeof(double)); | |||
| LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
| // Recursive kernel | |||
| RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
| // Free work space | |||
| free(Work); | |||
| } | |||
| /** zpbtrf's recursive compute kernel */ | |||
| static void RELAPACK_zpbtrf_rec( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| double *Work, const int *ldWork, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); | |||
| // Splitting | |||
| const int n1 = MIN(ZREC_SPLIT(*n), *kd); | |||
| const int n2 = *n - n1; | |||
| // * * | |||
| // * Ab_BR | |||
| double *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + 2 * *ldA * n1; | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, *kd - n1); | |||
| const int n22 = MIN(n2 - n21, *kd); | |||
| // n1 n21 n22 | |||
| // n1 * A_TRl A_TRr | |||
| // n21 A_BLt A_BRtl A_BRtr | |||
| // n22 A_BLb A_BRbl A_BRbr | |||
| double *const A_TRl = A_TR; | |||
| double *const A_TRr = A_TR + 2 * *ldA * n21; | |||
| double *const A_BLt = A_BL; | |||
| double *const A_BLb = A_BL + 2 * n21; | |||
| double *const A_BRtl = A_BR; | |||
| double *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
| double *const A_BRbl = A_BR + 2 * n21; | |||
| double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; | |||
| if (*uplo == 'L') { | |||
| // A_BLt = ABLt / A_TL' | |||
| BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_BLt' | |||
| BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); | |||
| // Work = A_BLb | |||
| LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); | |||
| // Work = Work / A_TL' | |||
| BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); | |||
| // A_BRbl = A_BRbl - Work * A_BLt' | |||
| BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRbr = A_BRbr - Work * Work' | |||
| BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
| // A_BLb = Work | |||
| LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); | |||
| } else { | |||
| // A_TRl = A_TL' \ A_TRl | |||
| BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // A_BRtl = A_BRtl - A_TRl' * A_TRl | |||
| BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // Work = A_TRr | |||
| LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); | |||
| // Work = A_TL' \ Work | |||
| BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); | |||
| // A_BRtr = A_BRtr - A_TRl' * Work | |||
| BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Work' * Work | |||
| BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); | |||
| // A_TRr = Work | |||
| LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| if (*kd > n1) | |||
| RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info); | |||
| else | |||
| RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,92 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_zpotrf_rec(const char *, const int *, double *, | |||
| const int *, int *); | |||
| /** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zpotrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html | |||
| * */ | |||
| void RELAPACK_zpotrf( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZPOTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_zpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** zpotrf's recursive compute kernel */ | |||
| static void RELAPACK_zpotrf_rec( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(zpotf2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| // Splitting | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + 2 * *ldA * n1; | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / A_TL' | |||
| BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * A_BL' | |||
| BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
| } else { | |||
| // A_TR = A_TL' \ A_TR | |||
| BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * A_TR | |||
| BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,238 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, | |||
| double *, const int *, int *, double *, const int *, int *); | |||
| /** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zsytrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html | |||
| * */ | |||
| void RELAPACK_zsytrf( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| double *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(double)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy arguments | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** zsytrf's recursive compute kernel */ | |||
| static void RELAPACK_zsytrf_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterator | |||
| int i; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = ZREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| double *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| double *const A_BL_B = A + 2 * *n; | |||
| double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| double *const Work_BL = Work + 2 * n1; | |||
| double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
| // last row of A_BL | |||
| double *const A_BL_b = A_BL + 2 * n2_out; | |||
| // last row of Work_BL | |||
| double *const Work_BL_b = Work_BL + 2 * n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = ZREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| double *const A_TL_T = A + 2 * *ldA * n_rest; | |||
| double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
| double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
| double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| double *const Work_L = Work; | |||
| double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,452 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static doublecomplex c_b1 = {1.,0.}; | |||
| static int c__1 = 1; | |||
| /** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method. | |||
| * | |||
| * This routine is a minor modification of LAPACK's zlasyf. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int * | |||
| nb, int *kb, doublecomplex *a, int *lda, int *ipiv, | |||
| doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| double d__1, d__2, d__3, d__4; | |||
| doublecomplex z__1, z__2, z__3; | |||
| /* Builtin functions */ | |||
| double sqrt(double), d_imag(doublecomplex *); | |||
| void z_div(doublecomplex *, doublecomplex *, doublecomplex *); | |||
| /* Local variables */ | |||
| static int j, k; | |||
| static doublecomplex t, r1, d11, d21, d22; | |||
| static int jj, kk, jp, kp, kw, kkw, imax, jmax; | |||
| static double alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int zscal_(int *, doublecomplex *, | |||
| doublecomplex *, int *); | |||
| static int kstep; | |||
| extern /* Subroutine */ int zgemv_(char *, int *, int *, | |||
| doublecomplex *, doublecomplex *, int *, doublecomplex *, | |||
| int *, doublecomplex *, doublecomplex *, int *, ftnlen), | |||
| zcopy_(int *, doublecomplex *, int *, doublecomplex *, | |||
| int *), zswap_(int *, doublecomplex *, int *, | |||
| doublecomplex *, int *); | |||
| static double absakk, colmax; | |||
| extern int izamax_(int *, doublecomplex *, int *); | |||
| static double rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.) + 1.) / 8.; | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| kstep = 1; | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw * | |||
| w_dim1]), abs(d__2)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
| kw * w_dim1]), abs(d__2)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| i__1 = k - imax; | |||
| jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], | |||
| &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| jmax + (kw - 1) * w_dim1]), abs(d__2)); | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| /* Computing MAX */ | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( | |||
| d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( | |||
| d__2)); | |||
| rowmax = max(d__3,d__4); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha * | |||
| rowmax) { | |||
| kp = imax; | |||
| zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = kk - 1 - kp; | |||
| zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| z_div(&z__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = z__1.r, r1.i = z__1.i; | |||
| i__1 = k - 1; | |||
| zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| z_div(&z__1, &w[k + kw * w_dim1], &d21); | |||
| d11.r = z__1.r, d11.i = z__1.i; | |||
| z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
| d22.r = z__1.r, d22.i = z__1.i; | |||
| z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; | |||
| z_div(&z__1, &c_b1, &z__2); | |||
| t.r = z__1.r, t.i = z__1.i; | |||
| z_div(&z__1, &t, &d21); | |||
| d21.r = z__1.r, d21.i = z__1.i; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
| .i; | |||
| z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
| d21.r * z__2.i + d21.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
| .i; | |||
| z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
| d21.r * z__2.i + d21.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| ++j; | |||
| } | |||
| ++j; | |||
| if (jp != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k | |||
| + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); | |||
| kstep = 1; | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * | |||
| w_dim1]), abs(d__2)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
| k * w_dim1]), abs(d__2)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| } else { | |||
| if (absakk >= alpha * colmax) { | |||
| kp = k; | |||
| } else { | |||
| i__1 = imax - k; | |||
| zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], | |||
| lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * | |||
| w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) | |||
| ; | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| jmax + (k + 1) * w_dim1]), abs(d__2)); | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| /* Computing MAX */ | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( | |||
| d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( | |||
| d__2)); | |||
| rowmax = max(d__3,d__4); | |||
| } | |||
| if (absakk >= alpha * colmax * (colmax / rowmax)) { | |||
| kp = k; | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha * | |||
| rowmax) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + | |||
| k * w_dim1], &c__1); | |||
| } else { | |||
| kp = imax; | |||
| kstep = 2; | |||
| } | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = kp - kk - 1; | |||
| zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| z_div(&z__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = z__1.r, r1.i = z__1.i; | |||
| i__1 = *n - k; | |||
| zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = z__1.r, d11.i = z__1.i; | |||
| z_div(&z__1, &w[k + k * w_dim1], &d21); | |||
| d22.r = z__1.r, d22.i = z__1.i; | |||
| z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; | |||
| z_div(&z__1, &c_b1, &z__2); | |||
| t.r = z__1.r, t.i = z__1.i; | |||
| z_div(&z__1, &t, &d21); | |||
| d21.r = z__1.r, d21.i = z__1.i; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
| .i; | |||
| z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
| d21.r * z__2.i + d21.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] | |||
| .i; | |||
| z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = | |||
| d21.r * z__2.i + d21.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -kp; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| jj = j; | |||
| jp = ipiv[j]; | |||
| if (jp < 0) { | |||
| jp = -jp; | |||
| --j; | |||
| } | |||
| --j; | |||
| if (jp != jj && j >= 1) { | |||
| zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *, | |||
| double *, const int *, int *, double *, const int *, int *); | |||
| /** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's zsytrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_zsytrf_rook( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| double *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(double)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
| /** zsytrf_rook's recursive compute kernel */ | |||
| static void RELAPACK_zsytrf_rook_rec( | |||
| const char *uplo, const int *n_full, const int *n, int *n_out, | |||
| double *A, const int *ldA, int *ipiv, | |||
| double *Work, const int *ldWork, int *info | |||
| ) { | |||
| // top recursion level? | |||
| const int top = *n_full == *n; | |||
| if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { | |||
| // Unblocked | |||
| if (top) { | |||
| LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); | |||
| *n_out = *n; | |||
| } else | |||
| RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); | |||
| return; | |||
| } | |||
| int info1, info2; | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| const int n_rest = *n_full - *n; | |||
| if (*uplo == 'L') { | |||
| // Splitting (setup) | |||
| int n1 = ZREC_SPLIT(*n); | |||
| int n2 = *n - n1; | |||
| // Work_L * | |||
| double *const Work_L = Work; | |||
| // recursion(A_L) | |||
| int n1_out; | |||
| RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); | |||
| n1 = n1_out; | |||
| // Splitting (continued) | |||
| n2 = *n - n1; | |||
| const int n_full2 = *n_full - n1; | |||
| // * * | |||
| // A_BL A_BR | |||
| // A_BL_B A_BR_B | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| double *const A_BL_B = A + 2 * *n; | |||
| double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; | |||
| // * * | |||
| // Work_BL Work_BR | |||
| // * * | |||
| // (top recursion level: use Work as Work_BR) | |||
| double *const Work_BL = Work + 2 * n1; | |||
| double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; | |||
| const int ldWork_BR = top ? n2 : *ldWork; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_B = ipiv + n1; | |||
| // A_BR = A_BR - A_BL Work_BL' | |||
| RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); | |||
| BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); | |||
| // recursion(A_BR) | |||
| int n2_out; | |||
| RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); | |||
| if (n2_out != n2) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // last column of A_BR | |||
| double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; | |||
| // last row of A_BL | |||
| double *const A_BL_b = A_BL + 2 * n2_out; | |||
| // last row of Work_BL | |||
| double *const Work_BL_b = Work_BL + 2 * n2_out; | |||
| // A_BR_r = A_BR_r + A_BL_b Work_BL_b' | |||
| BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); | |||
| } | |||
| n2 = n2_out; | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| if (ipiv_B[i] > 0) | |||
| ipiv_B[i] += n1; | |||
| else | |||
| ipiv_B[i] -= n1; | |||
| *info = info1 || info2; | |||
| *n_out = n1 + n2; | |||
| } else { | |||
| // Splitting (setup) | |||
| int n2 = ZREC_SPLIT(*n); | |||
| int n1 = *n - n2; | |||
| // * Work_R | |||
| // (top recursion level: use Work as Work_R) | |||
| double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; | |||
| // recursion(A_R) | |||
| int n2_out; | |||
| RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); | |||
| const int n2_diff = n2 - n2_out; | |||
| n2 = n2_out; | |||
| // Splitting (continued) | |||
| n1 = *n - n2; | |||
| const int n_full1 = *n_full - n2; | |||
| // * A_TL_T A_TR_T | |||
| // * A_TL A_TR | |||
| // * * * | |||
| double *const A_TL_T = A + 2 * *ldA * n_rest; | |||
| double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); | |||
| double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; | |||
| double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; | |||
| // Work_L * | |||
| // * Work_TR | |||
| // * * | |||
| // (top recursion level: Work_R was Work) | |||
| double *const Work_L = Work; | |||
| double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; | |||
| const int ldWork_L = top ? n1 : *ldWork; | |||
| // A_TL = A_TL - A_TR Work_TR' | |||
| RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); | |||
| BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); | |||
| // recursion(A_TL) | |||
| int n1_out; | |||
| RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); | |||
| if (n1_out != n1) { | |||
| // undo 1 column of updates | |||
| const int n_restp1 = n_rest + 1; | |||
| // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' | |||
| BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); | |||
| } | |||
| n1 = n1_out; | |||
| *info = info2 || info1; | |||
| *n_out = n1 + n2; | |||
| } | |||
| } | |||
| @@ -0,0 +1,561 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static doublecomplex c_b1 = {1.,0.}; | |||
| static int c__1 = 1; | |||
| /** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is a minor modification of LAPACK's zlasyf_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, doublecomplex *a, int *lda, int * | |||
| ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| double d__1, d__2; | |||
| doublecomplex z__1, z__2, z__3, z__4; | |||
| /* Builtin functions */ | |||
| double sqrt(double), d_imag(doublecomplex *); | |||
| void z_div(doublecomplex *, doublecomplex *, doublecomplex *); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static doublecomplex t, r1, d11, d12, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static double alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| static double dtemp, sfmin; | |||
| extern /* Subroutine */ int zscal_(int *, doublecomplex *, | |||
| doublecomplex *, int *); | |||
| static int itemp, kstep; | |||
| extern /* Subroutine */ int zgemv_(char *, int *, int *, | |||
| doublecomplex *, doublecomplex *, int *, doublecomplex *, | |||
| int *, doublecomplex *, doublecomplex *, int *, ftnlen), | |||
| zcopy_(int *, doublecomplex *, int *, doublecomplex *, | |||
| int *), zswap_(int *, doublecomplex *, int *, | |||
| doublecomplex *, int *); | |||
| extern double dlamch_(char *, ftnlen); | |||
| static double absakk, colmax; | |||
| extern int izamax_(int *, doublecomplex *, int *); | |||
| static double rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.) + 1.) / 8.; | |||
| sfmin = dlamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw * | |||
| w_dim1]), abs(d__2)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
| kw * w_dim1]), abs(d__2)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& | |||
| w[jmax + (kw - 1) * w_dim1]), abs(d__2)); | |||
| } else { | |||
| rowmax = 0.; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| i__1 = itemp + (kw - 1) * w_dim1; | |||
| dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| itemp + (kw - 1) * w_dim1]), abs(d__2)); | |||
| if (dtemp > rowmax) { | |||
| rowmax = dtemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax | |||
| + (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = k - p; | |||
| zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - k + 1; | |||
| zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + k * a_dim1; | |||
| i__2 = kk + k * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = k - 1 - kp; | |||
| zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - kk + 1; | |||
| zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + | |||
| k * a_dim1]), abs(d__2)) >= sfmin) { | |||
| z_div(&z__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = z__1.r, r1.i = z__1.i; | |||
| i__1 = k - 1; | |||
| zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = k + k * a_dim1; | |||
| if (a[i__1].r != 0. || a[i__1].i != 0.) { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| z_div(&z__1, &a[ii + k * a_dim1], &a[k + k * | |||
| a_dim1]); | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d12.r = w[i__1].r, d12.i = w[i__1].i; | |||
| z_div(&z__1, &w[k + kw * w_dim1], &d12); | |||
| d11.r = z__1.r, d11.i = z__1.i; | |||
| z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); | |||
| d22.r = z__1.r, d22.i = z__1.i; | |||
| z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; | |||
| z_div(&z__1, &c_b1, &z__2); | |||
| t.r = z__1.r, t.i = z__1.i; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| z_div(&z__2, &z__3, &d12); | |||
| z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * | |||
| z__2.i + t.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| z_div(&z__2, &z__3, &d12); | |||
| z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * | |||
| z__2.i + t.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| jj = j - 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| i__1 = *n - j + 1; | |||
| zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j <= *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * | |||
| w_dim1]), abs(d__2)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + | |||
| k * w_dim1]), abs(d__2)); | |||
| } else { | |||
| colmax = 0.; | |||
| } | |||
| if (max(absakk,colmax) == 0.) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| z__1.r = -1., z__1.i = -0.; | |||
| zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
| 1) * w_dim1], &c__1, (ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& | |||
| w[jmax + (k + 1) * w_dim1]), abs(d__2)); | |||
| } else { | |||
| rowmax = 0.; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = itemp + (k + 1) * w_dim1; | |||
| dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ | |||
| itemp + (k + 1) * w_dim1]), abs(d__2)); | |||
| if (dtemp > rowmax) { | |||
| rowmax = dtemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax | |||
| + (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p - k; | |||
| zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - p + 1; | |||
| zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
| c__1); | |||
| zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + k * a_dim1; | |||
| i__2 = kk + k * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = kp - k - 1; | |||
| zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
| * a_dim1], lda); | |||
| i__1 = *n - kp + 1; | |||
| zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
| a_dim1], &c__1); | |||
| zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + | |||
| k * a_dim1]), abs(d__2)) >= sfmin) { | |||
| z_div(&z__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = z__1.r, r1.i = z__1.i; | |||
| i__1 = *n - k; | |||
| zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = k + k * a_dim1; | |||
| if (a[i__1].r != 0. || a[i__1].i != 0.) { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| z_div(&z__1, &a[ii + k * a_dim1], &a[k + k * | |||
| a_dim1]); | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = z__1.r, d11.i = z__1.i; | |||
| z_div(&z__1, &w[k + k * w_dim1], &d21); | |||
| d22.r = z__1.r, d22.i = z__1.i; | |||
| z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; | |||
| z_div(&z__1, &c_b1, &z__2); | |||
| t.r = z__1.r, t.i = z__1.i; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| z_div(&z__2, &z__3, &d21); | |||
| z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * | |||
| z__2.i + t.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] | |||
| .i; | |||
| z_div(&z__2, &z__3, &d21); | |||
| z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * | |||
| z__2.i + t.i * z__2.r; | |||
| a[i__2].r = z__1.r, a[i__2].i = z__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| jj = j + 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j >= 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,268 @@ | |||
| #include "relapack.h" | |||
| #include <math.h> | |||
| static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, | |||
| const int *, const double *, const int *, const double *, const int *, | |||
| double *, const int *, const double *, const int *, const double *, | |||
| const int *, double *, const int *, double *, double *, double *, int *); | |||
| /** ZTGSYL solves the generalized Sylvester equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ztgsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html | |||
| * */ | |||
| void RELAPACK_ztgsyl( | |||
| const char *trans, const int *ijob, const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, | |||
| const double *D, const int *ldD, const double *E, const int *ldE, | |||
| double *F, const int *ldF, | |||
| double *scale, double *dif, | |||
| double *Work, const int *lWork, int *iWork, int *info | |||
| ) { | |||
| // Parse arguments | |||
| const int notran = LAPACK(lsame)(trans, "N"); | |||
| const int tran = LAPACK(lsame)(trans, "C"); | |||
| // Compute work buffer size | |||
| int lwmin = 1; | |||
| if (notran && (*ijob == 1 || *ijob == 2)) | |||
| lwmin = MAX(1, 2 * *m * *n); | |||
| *info = 0; | |||
| // Check arguments | |||
| if (!tran && !notran) | |||
| *info = -1; | |||
| else if (notran && (*ijob < 0 || *ijob > 4)) | |||
| *info = -2; | |||
| else if (*m <= 0) | |||
| *info = -3; | |||
| else if (*n <= 0) | |||
| *info = -4; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -6; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -8; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -10; | |||
| else if (*ldD < MAX(1, *m)) | |||
| *info = -12; | |||
| else if (*ldE < MAX(1, *n)) | |||
| *info = -14; | |||
| else if (*ldF < MAX(1, *m)) | |||
| *info = -16; | |||
| else if (*lWork < lwmin && *lWork != -1) | |||
| *info = -20; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZTGSYL", &minfo); | |||
| return; | |||
| } | |||
| if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = lwmin; | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantrans = notran ? 'N' : 'C'; | |||
| // Constant | |||
| const double ZERO[] = { 0., 0. }; | |||
| int isolve = 1; | |||
| int ifunc = 0; | |||
| if (notran) { | |||
| if (*ijob >= 3) { | |||
| ifunc = *ijob - 2; | |||
| LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
| LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
| } else if (*ijob >= 1) | |||
| isolve = 2; | |||
| } | |||
| double scale2; | |||
| int iround; | |||
| for (iround = 1; iround <= isolve; iround++) { | |||
| *scale = 1; | |||
| double dscale = 0; | |||
| double dsum = 1; | |||
| RELAPACK_ztgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info); | |||
| if (dscale != 0) { | |||
| if (*ijob == 1 || *ijob == 3) | |||
| *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); | |||
| else | |||
| *dif = sqrt(*m * *n) / (dscale * sqrt(dsum)); | |||
| } | |||
| if (isolve == 2) { | |||
| if (iround == 1) { | |||
| if (notran) | |||
| ifunc = *ijob; | |||
| scale2 = *scale; | |||
| LAPACK(zlacpy)("F", m, n, C, ldC, Work, m); | |||
| LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m); | |||
| LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC); | |||
| LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF); | |||
| } else { | |||
| LAPACK(zlacpy)("F", m, n, Work, m, C, ldC); | |||
| LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF); | |||
| *scale = scale2; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /** ztgsyl's recursive vompute kernel */ | |||
| static void RELAPACK_ztgsyl_rec( | |||
| const char *trans, const int *ifunc, const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, | |||
| const double *D, const int *ldD, const double *E, const int *ldE, | |||
| double *F, const int *ldF, | |||
| double *scale, double *dsum, double *dscale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) { | |||
| // Unblocked | |||
| LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| double scale1[] = { 1., 0. }; | |||
| double scale2[] = { 1., 0. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| const int m1 = ZREC_SPLIT(*m); | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const double *const A_TL = A; | |||
| const double *const A_TR = A + 2 * *ldA * m1; | |||
| const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
| // C_T | |||
| // C_B | |||
| double *const C_T = C; | |||
| double *const C_B = C + 2 * m1; | |||
| // D_TL D_TR | |||
| // 0 D_BR | |||
| const double *const D_TL = D; | |||
| const double *const D_TR = D + 2 * *ldD * m1; | |||
| const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1; | |||
| // F_T | |||
| // F_B | |||
| double *const F_T = F; | |||
| double *const F_B = F + 2 * m1; | |||
| if (*trans == 'N') { | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // F_T = F_T - D_TR * C_B | |||
| BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
| // C_B = C_B - A_TR^H * C_T | |||
| BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // C_B = C_B - D_TR^H * F_T | |||
| BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
| } | |||
| } | |||
| } else { | |||
| // Splitting | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const double *const B_TL = B; | |||
| const double *const B_TR = B + 2 * *ldB * n1; | |||
| const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // C_L C_R | |||
| double *const C_L = C; | |||
| double *const C_R = C + 2 * *ldC * n1; | |||
| // E_TL E_TR | |||
| // 0 E_BR | |||
| const double *const E_TL = E; | |||
| const double *const E_TR = E + 2 * *ldE * n1; | |||
| const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1; | |||
| // F_L F_R | |||
| double *const F_L = F; | |||
| double *const F_R = F + 2 * *ldF * n1; | |||
| if (*trans == 'N') { | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); | |||
| // C_R = C_R + F_L * B_TR | |||
| BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
| // F_R = F_R + F_L * E_TR | |||
| BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
| // F_L = F_L + C_R * B_TR | |||
| BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
| // F_L = F_L + F_R * E_TR | |||
| BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
| } | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,163 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, | |||
| const int *, const int *, const double *, const int *, const double *, | |||
| const int *, double *, const int *, double *, int *); | |||
| /** ZTRSYL solves the complex Sylvester matrix equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ztrsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html | |||
| * */ | |||
| void RELAPACK_ztrsyl( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, double *scale, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int notransA = LAPACK(lsame)(tranA, "N"); | |||
| const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
| const int notransB = LAPACK(lsame)(tranB, "N"); | |||
| const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
| *info = 0; | |||
| if (!ctransA && !notransA) | |||
| *info = -1; | |||
| else if (!ctransB && !notransB) | |||
| *info = -2; | |||
| else if (*isgn != 1 && *isgn != -1) | |||
| *info = -3; | |||
| else if (*m < 0) | |||
| *info = -4; | |||
| else if (*n < 0) | |||
| *info = -5; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -7; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -9; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -11; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZTRSYL", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantranA = notransA ? 'N' : 'C'; | |||
| const char cleantranB = notransB ? 'N' : 'C'; | |||
| // Recursive kernel | |||
| RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| /** ztrsyl's recursive compute kernel */ | |||
| static void RELAPACK_ztrsyl_rec( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const double *A, const int *ldA, const double *B, const int *ldB, | |||
| double *C, const int *ldC, double *scale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { | |||
| // Unblocked | |||
| RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1., 0. }; | |||
| const double MONE[] = { -1., 0. }; | |||
| const double MSGN[] = { -*isgn, 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| double scale1[] = { 1., 0. }; | |||
| double scale2[] = { 1., 0. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| const int m1 = ZREC_SPLIT(*m); | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const double *const A_TL = A; | |||
| const double *const A_TR = A + 2 * *ldA * m1; | |||
| const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
| // C_T | |||
| // C_B | |||
| double *const C_T = C; | |||
| double *const C_B = C + 2 * m1; | |||
| if (*tranA == 'N') { | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| } else { | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
| // C_B = C_B - A_TR' * C_T | |||
| BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
| } | |||
| } else { | |||
| // Splitting | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const double *const B_TL = B; | |||
| const double *const B_TR = B + 2 * *ldB * n1; | |||
| const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // C_L C_R | |||
| double *const C_L = C; | |||
| double *const C_R = C + 2 * *ldC * n1; | |||
| if (*tranB == 'N') { | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
| // C_R = C_R -/+ C_L * B_TR | |||
| BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| } else { | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
| // C_L = C_L -/+ C_R * B_TR' | |||
| BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,394 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "../config.h" | |||
| #include "f2c.h" | |||
| #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { | |||
| extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); | |||
| doublecomplex result; | |||
| zdotu_(&result, n, x, incx, y, incy); | |||
| return result; | |||
| } | |||
| #define zdotu_ zdotu_fun | |||
| doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { | |||
| extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); | |||
| doublecomplex result; | |||
| zdotc_(&result, n, x, incx, y, incy); | |||
| return result; | |||
| } | |||
| #define zdotc_ zdotc_fun | |||
| #endif | |||
| #if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { | |||
| extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *); | |||
| doublecomplex result; | |||
| zladiv_(&result, a, b); | |||
| return result; | |||
| } | |||
| #define zladiv_ zladiv_fun | |||
| #endif | |||
| /* Table of constant values */ | |||
| static int c__1 = 1; | |||
| /** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) | |||
| * | |||
| * This routine is an exact copy of LAPACK's ztrsyl. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int | |||
| *isgn, int *m, int *n, doublecomplex *a, int *lda, | |||
| doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, | |||
| double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, | |||
| i__3, i__4; | |||
| double d__1, d__2; | |||
| doublecomplex z__1, z__2, z__3, z__4; | |||
| /* Builtin functions */ | |||
| double d_imag(doublecomplex *); | |||
| void d_cnjg(doublecomplex *, doublecomplex *); | |||
| /* Local variables */ | |||
| static int j, k, l; | |||
| static doublecomplex a11; | |||
| static double db; | |||
| static doublecomplex x11; | |||
| static double da11; | |||
| static doublecomplex vec; | |||
| static double dum[1], eps, sgn, smin; | |||
| static doublecomplex suml, sumr; | |||
| extern int lsame_(char *, char *, ftnlen, ftnlen); | |||
| /* Double Complex */ doublecomplex zdotc_(int *, | |||
| doublecomplex *, int *, doublecomplex *, int *), zdotu_( | |||
| int *, doublecomplex *, int *, | |||
| doublecomplex *, int *); | |||
| extern /* Subroutine */ int dlabad_(double *, double *); | |||
| extern double dlamch_(char *, ftnlen); | |||
| static double scaloc; | |||
| extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); | |||
| extern double zlange_(char *, int *, int *, doublecomplex *, | |||
| int *, double *, ftnlen); | |||
| static double bignum; | |||
| extern /* Subroutine */ int zdscal_(int *, double *, | |||
| doublecomplex *, int *); | |||
| /* Double Complex */ doublecomplex zladiv_(doublecomplex *, | |||
| doublecomplex *); | |||
| static int notrna, notrnb; | |||
| static double smlnum; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| b_dim1 = *ldb; | |||
| b_offset = 1 + b_dim1; | |||
| b -= b_offset; | |||
| c_dim1 = *ldc; | |||
| c_offset = 1 + c_dim1; | |||
| c__ -= c_offset; | |||
| /* Function Body */ | |||
| notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); | |||
| notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); | |||
| *info = 0; | |||
| if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { | |||
| *info = -1; | |||
| } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { | |||
| *info = -2; | |||
| } else if (*isgn != 1 && *isgn != -1) { | |||
| *info = -3; | |||
| } else if (*m < 0) { | |||
| *info = -4; | |||
| } else if (*n < 0) { | |||
| *info = -5; | |||
| } else if (*lda < max(1,*m)) { | |||
| *info = -7; | |||
| } else if (*ldb < max(1,*n)) { | |||
| *info = -9; | |||
| } else if (*ldc < max(1,*m)) { | |||
| *info = -11; | |||
| } | |||
| if (*info != 0) { | |||
| i__1 = -(*info); | |||
| xerbla_("ZTRSY2", &i__1, (ftnlen)6); | |||
| return; | |||
| } | |||
| *scale = 1.; | |||
| if (*m == 0 || *n == 0) { | |||
| return; | |||
| } | |||
| eps = dlamch_("P", (ftnlen)1); | |||
| smlnum = dlamch_("S", (ftnlen)1); | |||
| bignum = 1. / smlnum; | |||
| dlabad_(&smlnum, &bignum); | |||
| smlnum = smlnum * (double) (*m * *n) / eps; | |||
| bignum = 1. / smlnum; | |||
| /* Computing MAX */ | |||
| d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, ( | |||
| ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n, | |||
| &b[b_offset], ldb, dum, (ftnlen)1); | |||
| smin = max(d__1,d__2); | |||
| sgn = (double) (*isgn); | |||
| if (notrna && notrnb) { | |||
| i__1 = *n; | |||
| for (l = 1; l <= i__1; ++l) { | |||
| for (k = *m; k >= 1; --k) { | |||
| i__2 = *m - k; | |||
| /* Computing MIN */ | |||
| i__3 = k + 1; | |||
| /* Computing MIN */ | |||
| i__4 = k + 1; | |||
| z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ | |||
| min(i__4,*m) + l * c_dim1], &c__1); | |||
| suml.r = z__1.r, suml.i = z__1.i; | |||
| i__2 = l - 1; | |||
| z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] | |||
| , &c__1); | |||
| sumr.r = z__1.r, sumr.i = z__1.i; | |||
| i__2 = k + l * c_dim1; | |||
| z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; | |||
| z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; | |||
| z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; | |||
| vec.r = z__1.r, vec.i = z__1.i; | |||
| scaloc = 1.; | |||
| i__2 = k + k * a_dim1; | |||
| i__3 = l + l * b_dim1; | |||
| z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i; | |||
| z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i; | |||
| a11.r = z__1.r, a11.i = z__1.i; | |||
| da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( | |||
| d__2)); | |||
| if (da11 <= smin) { | |||
| a11.r = smin, a11.i = 0.; | |||
| da11 = smin; | |||
| *info = 1; | |||
| } | |||
| db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( | |||
| d__2)); | |||
| if (da11 < 1. && db > 1.) { | |||
| if (db > bignum * da11) { | |||
| scaloc = 1. / db; | |||
| } | |||
| } | |||
| z__3.r = scaloc, z__3.i = 0.; | |||
| z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * | |||
| z__3.i + vec.i * z__3.r; | |||
| z__1 = zladiv_(&z__2, &a11); | |||
| x11.r = z__1.r, x11.i = z__1.i; | |||
| if (scaloc != 1.) { | |||
| i__2 = *n; | |||
| for (j = 1; j <= i__2; ++j) { | |||
| zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
| /* L10: */ | |||
| } | |||
| *scale *= scaloc; | |||
| } | |||
| i__2 = k + l * c_dim1; | |||
| c__[i__2].r = x11.r, c__[i__2].i = x11.i; | |||
| /* L20: */ | |||
| } | |||
| /* L30: */ | |||
| } | |||
| } else if (! notrna && notrnb) { | |||
| i__1 = *n; | |||
| for (l = 1; l <= i__1; ++l) { | |||
| i__2 = *m; | |||
| for (k = 1; k <= i__2; ++k) { | |||
| i__3 = k - 1; | |||
| z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * | |||
| c_dim1 + 1], &c__1); | |||
| suml.r = z__1.r, suml.i = z__1.i; | |||
| i__3 = l - 1; | |||
| z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] | |||
| , &c__1); | |||
| sumr.r = z__1.r, sumr.i = z__1.i; | |||
| i__3 = k + l * c_dim1; | |||
| z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; | |||
| z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; | |||
| z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i; | |||
| vec.r = z__1.r, vec.i = z__1.i; | |||
| scaloc = 1.; | |||
| d_cnjg(&z__2, &a[k + k * a_dim1]); | |||
| i__3 = l + l * b_dim1; | |||
| z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; | |||
| z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; | |||
| a11.r = z__1.r, a11.i = z__1.i; | |||
| da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( | |||
| d__2)); | |||
| if (da11 <= smin) { | |||
| a11.r = smin, a11.i = 0.; | |||
| da11 = smin; | |||
| *info = 1; | |||
| } | |||
| db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( | |||
| d__2)); | |||
| if (da11 < 1. && db > 1.) { | |||
| if (db > bignum * da11) { | |||
| scaloc = 1. / db; | |||
| } | |||
| } | |||
| z__3.r = scaloc, z__3.i = 0.; | |||
| z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * | |||
| z__3.i + vec.i * z__3.r; | |||
| z__1 = zladiv_(&z__2, &a11); | |||
| x11.r = z__1.r, x11.i = z__1.i; | |||
| if (scaloc != 1.) { | |||
| i__3 = *n; | |||
| for (j = 1; j <= i__3; ++j) { | |||
| zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
| /* L40: */ | |||
| } | |||
| *scale *= scaloc; | |||
| } | |||
| i__3 = k + l * c_dim1; | |||
| c__[i__3].r = x11.r, c__[i__3].i = x11.i; | |||
| /* L50: */ | |||
| } | |||
| /* L60: */ | |||
| } | |||
| } else if (! notrna && ! notrnb) { | |||
| for (l = *n; l >= 1; --l) { | |||
| i__1 = *m; | |||
| for (k = 1; k <= i__1; ++k) { | |||
| i__2 = k - 1; | |||
| z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * | |||
| c_dim1 + 1], &c__1); | |||
| suml.r = z__1.r, suml.i = z__1.i; | |||
| i__2 = *n - l; | |||
| /* Computing MIN */ | |||
| i__3 = l + 1; | |||
| /* Computing MIN */ | |||
| i__4 = l + 1; | |||
| z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ | |||
| l + min(i__4,*n) * b_dim1], ldb); | |||
| sumr.r = z__1.r, sumr.i = z__1.i; | |||
| i__2 = k + l * c_dim1; | |||
| d_cnjg(&z__4, &sumr); | |||
| z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i; | |||
| z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; | |||
| z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; | |||
| vec.r = z__1.r, vec.i = z__1.i; | |||
| scaloc = 1.; | |||
| i__2 = k + k * a_dim1; | |||
| i__3 = l + l * b_dim1; | |||
| z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; | |||
| z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i; | |||
| d_cnjg(&z__1, &z__2); | |||
| a11.r = z__1.r, a11.i = z__1.i; | |||
| da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( | |||
| d__2)); | |||
| if (da11 <= smin) { | |||
| a11.r = smin, a11.i = 0.; | |||
| da11 = smin; | |||
| *info = 1; | |||
| } | |||
| db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( | |||
| d__2)); | |||
| if (da11 < 1. && db > 1.) { | |||
| if (db > bignum * da11) { | |||
| scaloc = 1. / db; | |||
| } | |||
| } | |||
| z__3.r = scaloc, z__3.i = 0.; | |||
| z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * | |||
| z__3.i + vec.i * z__3.r; | |||
| z__1 = zladiv_(&z__2, &a11); | |||
| x11.r = z__1.r, x11.i = z__1.i; | |||
| if (scaloc != 1.) { | |||
| i__2 = *n; | |||
| for (j = 1; j <= i__2; ++j) { | |||
| zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
| /* L70: */ | |||
| } | |||
| *scale *= scaloc; | |||
| } | |||
| i__2 = k + l * c_dim1; | |||
| c__[i__2].r = x11.r, c__[i__2].i = x11.i; | |||
| /* L80: */ | |||
| } | |||
| /* L90: */ | |||
| } | |||
| } else if (notrna && ! notrnb) { | |||
| for (l = *n; l >= 1; --l) { | |||
| for (k = *m; k >= 1; --k) { | |||
| i__1 = *m - k; | |||
| /* Computing MIN */ | |||
| i__2 = k + 1; | |||
| /* Computing MIN */ | |||
| i__3 = k + 1; | |||
| z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[ | |||
| min(i__3,*m) + l * c_dim1], &c__1); | |||
| suml.r = z__1.r, suml.i = z__1.i; | |||
| i__1 = *n - l; | |||
| /* Computing MIN */ | |||
| i__2 = l + 1; | |||
| /* Computing MIN */ | |||
| i__3 = l + 1; | |||
| z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[ | |||
| l + min(i__3,*n) * b_dim1], ldb); | |||
| sumr.r = z__1.r, sumr.i = z__1.i; | |||
| i__1 = k + l * c_dim1; | |||
| d_cnjg(&z__4, &sumr); | |||
| z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i; | |||
| z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; | |||
| z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i; | |||
| vec.r = z__1.r, vec.i = z__1.i; | |||
| scaloc = 1.; | |||
| i__1 = k + k * a_dim1; | |||
| d_cnjg(&z__3, &b[l + l * b_dim1]); | |||
| z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i; | |||
| z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i; | |||
| a11.r = z__1.r, a11.i = z__1.i; | |||
| da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( | |||
| d__2)); | |||
| if (da11 <= smin) { | |||
| a11.r = smin, a11.i = 0.; | |||
| da11 = smin; | |||
| *info = 1; | |||
| } | |||
| db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( | |||
| d__2)); | |||
| if (da11 < 1. && db > 1.) { | |||
| if (db > bignum * da11) { | |||
| scaloc = 1. / db; | |||
| } | |||
| } | |||
| z__3.r = scaloc, z__3.i = 0.; | |||
| z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * | |||
| z__3.i + vec.i * z__3.r; | |||
| z__1 = zladiv_(&z__2, &a11); | |||
| x11.r = z__1.r, x11.i = z__1.i; | |||
| if (scaloc != 1.) { | |||
| i__1 = *n; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); | |||
| /* L100: */ | |||
| } | |||
| *scale *= scaloc; | |||
| } | |||
| i__1 = k + l * c_dim1; | |||
| c__[i__1].r = x11.r, c__[i__1].i = x11.i; | |||
| /* L110: */ | |||
| } | |||
| /* L120: */ | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,107 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, | |||
| double *, const int *, int *); | |||
| /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ztrtri. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html | |||
| * */ | |||
| void RELAPACK_ztrtri( | |||
| const char *uplo, const char *diag, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int nounit = LAPACK(lsame)(diag, "N"); | |||
| const int unit = LAPACK(lsame)(diag, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (!nounit && !unit) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("ZTRTRI", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleandiag = nounit ? 'N' : 'U'; | |||
| // check for singularity | |||
| if (nounit) { | |||
| int i; | |||
| for (i = 0; i < *n; i++) | |||
| if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { | |||
| *info = i; | |||
| return; | |||
| } | |||
| } | |||
| // Recursive kernel | |||
| RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
| } | |||
| /** ztrtri's recursive compute kernel */ | |||
| static void RELAPACK_ztrtri_rec( | |||
| const char *uplo, const char *diag, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) { | |||
| // Unblocked | |||
| LAPACK(ztrti2)(uplo, diag, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| // Splitting | |||
| const int n1 = ZREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + 2 * *ldA * n1; | |||
| double *const A_BL = A + 2 * n1; | |||
| double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = - A_BL * A_TL | |||
| BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BL = A_BR \ A_BL | |||
| BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TR = - A_TL * A_TR | |||
| BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
| // A_TR = A_TR / A_BR | |||
| BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||