Add Elmar Peise's ReLAPACK

8 years ago · 9b7b5f7fdc
--- a/relapack/LICENSE
+++ b/relapack/LICENSE
@@ -0,0 +1,22 @@
 The MIT License (MIT)

 Copyright (c) 2016 Elmar Peise

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.

--- a/relapack/Makefile
+++ b/relapack/Makefile
@@ -0,0 +1,64 @@
 TOPDIR  = ..
 include $(TOPDIR)/Makefile.system



 SRC = $(wildcard src/*.c)
 OBJS = $(SRC:%.c=%.o)

 TEST_SUITS = \
 	slauum dlauum clauum zlauum \
 	spotrf dpotrf cpotrf zpotrf \
 	spbtrf dpbtrf cpbtrf zpbtrf \
 	ssygst dsygst chegst zhegst \
 	ssytrf dsytrf csytrf chetrf zsytrf zhetrf \
 	sgetrf dgetrf cgetrf zgetrf \
 	sgbtrf dgbtrf cgbtrf zgbtrf \
 	strsyl dtrsyl ctrsyl ztrsyl \
 	stgsyl dtgsyl ctgsyl ztgsyl \
 	sgemmt dgemmt cgemmt zgemmt
 TESTS = $(TEST_SUITS:%=test/%.pass)  # dummies
 TEST_EXES = $(TEST_SUITS:%=test/%.x)

 LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm

 .SECONDARY: $(TEST_EXES)
 .PHONY: test

 # ReLAPACK compilation

 libs:	$(OBJS)
 	@echo "Building ReLAPACK library $(LIBNAME)"
 	$(AR) -r  $(TOPDIR)/$(LIBNAME) $(OBJS)
 	$(RANLIB) $(TOPDIR)/$(LIBNAME)

 %.o: %.c config.h
 	$(CC) $(CFLAGS) -c $< -o $@


 # ReLAPACK testing

 test: $(TEST_EXES) $(TESTS)
 	@echo "passed all tests"

 test/%.pass: test/%.x
 	@echo -n $*:
 	@./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<)

 test/s%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
 	$(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

 test/d%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
 	$(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

 test/c%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
 	$(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)

 test/z%.x: test/x%.c test/util.o $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
 	$(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.o -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)


 # cleaning up

 clean:
 	rm -f  $(OBJS) test/util.o test/*.x
--- a/relapack/README.md
+++ b/relapack/README.md
@@ -0,0 +1,68 @@
 ReLAPACK
 ========

 [![Build Status](https://travis-ci.org/HPAC/ReLAPACK.svg?branch=master)](https://travis-ci.org/HPAC/ReLAPACK)

 [Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK)

 ReLAPACK offers a collection of recursive algorithms for many of LAPACK's
 compute kernels.  Since it preserves LAPACK's established interfaces, ReLAPACK
 integrates effortlessly into existing application codes.  ReLAPACK's routines
 not only outperform the reference LAPACK but also improve upon the performance
 of tuned implementations, such as OpenBLAS and MKL.


 Coverage
 --------
 For a detailed list of covered operations and an overview of operations to which
 recursion is not efficiently applicable, see [coverage.md](coverage.md).


 Installation
 ------------
 To compile with the default configuration, simply run `make` to create the
 library `librelapack.a`.

 ### Linking with MKL
 Note that to link with MKL, you currently need to set the flag
 `COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and
 `ztrsyl`.  For further configuration options see [config.md](config.md).


 ### Dependencies
 ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked
 kernels from [LAPACK](http://www.netlib.org/lapack/).  There are many optimized
 and machine specific implementations of these libraries, which are commonly
 provided by hardware vendors or available as open source (e.g.,
 [OpenBLAS](http://www.openblas.net/)).


 Testing
 -------
 ReLAPACK's test suite compares its routines numerically with LAPACK's
 counterparts.  To set up the tests (located int `test/`) you need to specify
 link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then
 `make test` runs the tests.  For details on the performed tests, see
 [test/README.md](test/README.md).


 Examples
 --------
 Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the
 covered routines applies directly to ReLAPACK.  A few separate examples are
 given in `examples/`. For details, see [examples/README.md](examples/README.md).


 Citing
 ------
 When referencing ReLAPACK, please cite the preprint of the paper
 [Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763):

    @article{relapack,
      author    = {Elmar Peise and Paolo Bientinesi},
      title     = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection},
      journal   = {CoRR},
      volume    = {abs/1602.06763},
      year      = {2016},
      url       = {http://arxiv.org/abs/1602.06763},
    }
--- a/relapack/config.h
+++ b/relapack/config.h
@@ -0,0 +1,208 @@
 #ifndef RELAPACK_CONFIG_H
 #define RELAPACK_CONFIG_H

 // ReLAPACK configuration file.
 // See also config.md


 ///////////////////////////////
 // BLAS/LAPACK obect symbols //
 ///////////////////////////////

 // BLAS routines linked against have a trailing underscore
 #define BLAS_UNDERSCORE 1
 // LAPACK routines linked against have a trailing underscore
 #define LAPACK_UNDERSCORE BLAS_UNDERSCORE

 // Complex BLAS/LAPACK routines return their result in the first argument
 // This option must be enabled when linking to MKL for ctrsyl and ztrsyl to
 // work.
 #define COMPLEX_FUNCTIONS_AS_ROUTINES 0
 #ifdef F_INTERFACE_INTEL
 #define COMPLEX_FUNCTIONS_AS_ROUTINES 1
 #endif
 #define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
 #define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES

 // The BLAS-like extension xgemmt is provided by an external library.
 #define HAVE_XGEMMT 0


 ////////////////////////////
 // Use malloc in ReLAPACK //
 ////////////////////////////

 #define ALLOW_MALLOC 1
 // allow malloc in xsygst for improved performance
 #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC
 // allow malloc in xsytrf if the passed work buffer is too small
 #define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC


 ////////////////////////////////
 // LAPACK routine replacement //
 ////////////////////////////////
 // The following macros specify which routines are included in the library under
 // LAPACK's symbol names: 1 included, 0 not included

 #define INCLUDE_ALL 1

 #define INCLUDE_XLAUUM INCLUDE_ALL
 #define INCLUDE_SLAUUM INCLUDE_XLAUUM
 #define INCLUDE_DLAUUM INCLUDE_XLAUUM
 #define INCLUDE_CLAUUM INCLUDE_XLAUUM
 #define INCLUDE_ZLAUUM INCLUDE_XLAUUM

 #define INCLUDE_XSYGST INCLUDE_ALL
 #define INCLUDE_SSYGST INCLUDE_XSYGST
 #define INCLUDE_DSYGST INCLUDE_XSYGST
 #define INCLUDE_CHEGST INCLUDE_XSYGST
 #define INCLUDE_ZHEGST INCLUDE_XSYGST

 #define INCLUDE_XTRTRI INCLUDE_ALL
 #define INCLUDE_STRTRI INCLUDE_XTRTRI
 #define INCLUDE_DTRTRI INCLUDE_XTRTRI
 #define INCLUDE_CTRTRI INCLUDE_XTRTRI
 #define INCLUDE_ZTRTRI INCLUDE_XTRTRI

 #define INCLUDE_XPOTRF INCLUDE_ALL
 #define INCLUDE_SPOTRF INCLUDE_XPOTRF
 #define INCLUDE_DPOTRF INCLUDE_XPOTRF
 #define INCLUDE_CPOTRF INCLUDE_XPOTRF
 #define INCLUDE_ZPOTRF INCLUDE_XPOTRF

 #define INCLUDE_XPBTRF INCLUDE_ALL
 #define INCLUDE_SPBTRF INCLUDE_XPBTRF
 #define INCLUDE_DPBTRF INCLUDE_XPBTRF
 #define INCLUDE_CPBTRF INCLUDE_XPBTRF
 #define INCLUDE_ZPBTRF INCLUDE_XPBTRF

 #define INCLUDE_XSYTRF INCLUDE_ALL
 #define INCLUDE_SSYTRF INCLUDE_XSYTRF
 #define INCLUDE_DSYTRF INCLUDE_XSYTRF
 #define INCLUDE_CSYTRF INCLUDE_XSYTRF
 #define INCLUDE_CHETRF INCLUDE_XSYTRF
 #define INCLUDE_ZSYTRF INCLUDE_XSYTRF
 #define INCLUDE_ZHETRF INCLUDE_XSYTRF
 #define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF
 #define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF
 #define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF
 #define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF
 #define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF
 #define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF

 #define INCLUDE_XGETRF INCLUDE_ALL
 #define INCLUDE_SGETRF INCLUDE_XGETRF
 #define INCLUDE_DGETRF INCLUDE_XGETRF
 #define INCLUDE_CGETRF INCLUDE_XGETRF
 #define INCLUDE_ZGETRF INCLUDE_XGETRF

 #define INCLUDE_XGBTRF INCLUDE_ALL
 #define INCLUDE_SGBTRF INCLUDE_XGBTRF
 #define INCLUDE_DGBTRF INCLUDE_XGBTRF
 #define INCLUDE_CGBTRF INCLUDE_XGBTRF
 #define INCLUDE_ZGBTRF INCLUDE_XGBTRF

 #define INCLUDE_XTRSYL INCLUDE_ALL
 #define INCLUDE_STRSYL INCLUDE_XTRSYL
 #define INCLUDE_DTRSYL INCLUDE_XTRSYL
 #define INCLUDE_CTRSYL INCLUDE_XTRSYL
 #define INCLUDE_ZTRSYL INCLUDE_XTRSYL

 #define INCLUDE_XTGSYL INCLUDE_ALL
 #define INCLUDE_STGSYL INCLUDE_XTGSYL
 #define INCLUDE_DTGSYL INCLUDE_XTGSYL
 #define INCLUDE_CTGSYL INCLUDE_XTGSYL
 #define INCLUDE_ZTGSYL INCLUDE_XTGSYL

 #define INCLUDE_XGEMMT 0
 #define INCLUDE_SGEMMT INCLUDE_XGEMMT
 #define INCLUDE_DGEMMT INCLUDE_XGEMMT
 #define INCLUDE_CGEMMT INCLUDE_XGEMMT
 #define INCLUDE_ZGEMMT INCLUDE_XGEMMT


 /////////////////////
 // crossover sizes //
 /////////////////////

 // default crossover size
 #define CROSSOVER 24

 // individual crossover sizes
 #define CROSSOVER_XLAUUM CROSSOVER
 #define CROSSOVER_SLAUUM CROSSOVER_XLAUUM
 #define CROSSOVER_DLAUUM CROSSOVER_XLAUUM
 #define CROSSOVER_CLAUUM CROSSOVER_XLAUUM
 #define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM

 #define CROSSOVER_XSYGST CROSSOVER
 #define CROSSOVER_SSYGST CROSSOVER_XSYGST
 #define CROSSOVER_DSYGST CROSSOVER_XSYGST
 #define CROSSOVER_CHEGST CROSSOVER_XSYGST
 #define CROSSOVER_ZHEGST CROSSOVER_XSYGST

 #define CROSSOVER_XTRTRI CROSSOVER
 #define CROSSOVER_STRTRI CROSSOVER_XTRTRI
 #define CROSSOVER_DTRTRI CROSSOVER_XTRTRI
 #define CROSSOVER_CTRTRI CROSSOVER_XTRTRI
 #define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI

 #define CROSSOVER_XPOTRF CROSSOVER
 #define CROSSOVER_SPOTRF CROSSOVER_XPOTRF
 #define CROSSOVER_DPOTRF CROSSOVER_XPOTRF
 #define CROSSOVER_CPOTRF CROSSOVER_XPOTRF
 #define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF

 #define CROSSOVER_XPBTRF CROSSOVER
 #define CROSSOVER_SPBTRF CROSSOVER_XPBTRF
 #define CROSSOVER_DPBTRF CROSSOVER_XPBTRF
 #define CROSSOVER_CPBTRF CROSSOVER_XPBTRF
 #define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF

 #define CROSSOVER_XSYTRF CROSSOVER
 #define CROSSOVER_SSYTRF CROSSOVER_XSYTRF
 #define CROSSOVER_DSYTRF CROSSOVER_XSYTRF
 #define CROSSOVER_CSYTRF CROSSOVER_XSYTRF
 #define CROSSOVER_CHETRF CROSSOVER_XSYTRF
 #define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF
 #define CROSSOVER_ZHETRF CROSSOVER_XSYTRF
 #define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF
 #define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF
 #define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF
 #define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF
 #define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF
 #define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF

 #define CROSSOVER_XGETRF CROSSOVER
 #define CROSSOVER_SGETRF CROSSOVER_XGETRF
 #define CROSSOVER_DGETRF CROSSOVER_XGETRF
 #define CROSSOVER_CGETRF CROSSOVER_XGETRF
 #define CROSSOVER_ZGETRF CROSSOVER_XGETRF

 #define CROSSOVER_XGBTRF CROSSOVER
 #define CROSSOVER_SGBTRF CROSSOVER_XGBTRF
 #define CROSSOVER_DGBTRF CROSSOVER_XGBTRF
 #define CROSSOVER_CGBTRF CROSSOVER_XGBTRF
 #define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF

 #define CROSSOVER_XTRSYL CROSSOVER
 #define CROSSOVER_STRSYL CROSSOVER_XTRSYL
 #define CROSSOVER_DTRSYL CROSSOVER_XTRSYL
 #define CROSSOVER_CTRSYL CROSSOVER_XTRSYL
 #define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL

 #define CROSSOVER_XTGSYL CROSSOVER
 #define CROSSOVER_STGSYL CROSSOVER_XTGSYL
 #define CROSSOVER_DTGSYL CROSSOVER_XTGSYL
 #define CROSSOVER_CTGSYL CROSSOVER_XTGSYL
 #define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL

 // sytrf helper routine
 #define CROSSOVER_XGEMMT CROSSOVER_XSYTRF
 #define CROSSOVER_SGEMMT CROSSOVER_XGEMMT
 #define CROSSOVER_DGEMMT CROSSOVER_XGEMMT
 #define CROSSOVER_CGEMMT CROSSOVER_XGEMMT
 #define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT

 #endif /* RELAPACK_CONFIG_H */
--- a/relapack/config.md
+++ b/relapack/config.md
@@ -0,0 +1,87 @@
 RELAPACK Configuration
 ======================

 ReLAPACK has two configuration files: `make.inc`, which is included by the
 Makefile, and `config.h` which is included in the source files.


 Build and Testing Environment
 -----------------------------
 The build environment (compiler and flags) and the test configuration (linker
 flags for BLAS and LAPACK) are specified in `make.inc`.  The test matrix size
 and error bounds are defined in `test/config.h`.

 The library `librelapack.a` is compiled by invoking `make`.  The tests are
 performed by either `make test` or calling `make` in the test folder.


 BLAS/LAPACK complex function interfaces
 ---------------------------------------
 For BLAS and LAPACK functions that return a complex number, there exist two
 conflicting (FORTRAN compiler dependent) calling conventions: either the result
 is returned as a `struct` of two floating point numbers or an additional first
 argument with a pointer to such a `struct` is used.  By default ReLAPACK uses
 the former (which is what gfortran uses), but it can switch to the latter by
 setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK
 specific counterparts) to `1` in `config.h`.

 **For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.**

 (Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases
 will segfault or return errors on the order of 1 or larger.)


 BLAS extension `xgemmt`
 -----------------------
 The LDL decompositions require a general matrix-matrix product that updates only
 a triangular matrix called `xgemmt`.  If the BLAS implementation linked against
 provides such a routine, set the flag `HAVE_XGEMMT` to `1` in `config.h`;
 otherwise, ReLAPACK uses its own recursive implementation of these kernels.

 `xgemmt` is provided by MKL.


 Routine Selection
 -----------------
 ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`).  If the
 corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to
 `1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g.,
 `dgetrf_`).  By default, wrappers for all routines are enabled.


 Crossover Size
 --------------
 The crossover size determines below which matrix sizes ReLAPACK's recursive
 algorithms switch to LAPACK's unblocked routines to avoid tiny BLAS Level 3
 routines.  The crossover size is set in `config.h` and can be chosen either
 globally for the entire library, by operation, or individually by routine.


 Allowing Temporary Buffers
 --------------------------
 Two of ReLAPACK's routines make use of temporary buffers, which are allocated
 and freed within ReLAPACK.  Setting `ALLOW_MALLOC` (or one of the routine
 specific counterparts) to 0 in `config.h` will disable these buffers.  The
 affected routines are:

 * `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2.  As in
   LAPACK, this size can be queried by setting `lWork = -1` and the passed
   buffer will be used if it is large enough; only if it is not, a local buffer
   will be allocated.

   The advantage of this mechanism is that ReLAPACK will seamlessly work even
   with codes that statically provide too little memory instead of breaking
   them.

 * `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem
   to standard form can use an auxiliary buffer of size n^2 / 2 to avoid
   redundant computations.  It thereby performs about 30% less FLOPs than
   LAPACK.


 FORTRAN symbol names
 --------------------
 ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN interfaces.
 Since these libraries usually have an underscore to their symbol names, ReLAPACK
 has configuration switches in `config.h` to adjust the corresponding routine
 names.
--- a/relapack/coverage.md
+++ b/relapack/coverage.md
@@ -0,0 +1,212 @@
 Coverage of ReLAPACK
 ====================

 This file lists all LAPACK compute routines that are covered by recursive
 algorithms in ReLAPACK, it also lists all of LAPACK's blocked algorithms which
 are not (yet) part of ReLAPACK.

 <!-- START doctoc generated TOC please keep comment here to allow auto update -->
 <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
 **Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*

 - [List of covered LAPACK routines](#list-of-covered-lapack-routines)
  - [`xlauum`](#xlauum)
  - [`xsygst`](#xsygst)
  - [`xtrtri`](#xtrtri)
  - [`xpotrf`](#xpotrf)
  - [`xpbtrf`](#xpbtrf)
  - [`xsytrf`](#xsytrf)
  - [`xgetrf`](#xgetrf)
  - [`xgbtrf`](#xgbtrf)
  - [`xtrsyl`](#xtrsyl)
  - [`xtgsyl`](#xtgsyl)
 - [Covered BLAS extension](#covered-blas-extension)
  - [`xgemmt`](#xgemmt)
 - [Not covered yet](#not-covered-yet)
  - [`xpstrf`](#xpstrf)
 - [Not covered: extra FLOPs](#not-covered-extra-flops)
  - [QR decomposition (and related)](#qr-decomposition-and-related)
  - [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal)
  - [Symmetric reduction to bidiagonal](#symmetric-reduction-to-bidiagonal)
  - [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg)

 <!-- END doctoc generated TOC please keep comment here to allow auto update -->


 List of covered LAPACK routines
 -------------------------------

 ### `xlauum`
 Multiplication of a triangular matrix with its (complex conjugate) transpose,
 resulting in a symmetric (Hermitian) matrix.

 Routines: `slauum`, `dlauum`, `clauum`, `zlauum`

 Operations:
 * A = L^T L
 * A = U U^T

 ### `xsygst`
 Simultaneous two-sided multiplication of a symmetric matrix with a triangular
 matrix and its transpose

 Routines: `ssygst`, `dsygst`, `chegst`, `zhegst`

 Operations:
 * A = inv(L) A inv(L^T)
 * A = inv(U^T) A inv(U)
 * A = L^T A L
 * A = U A U^T

 ### `xtrtri`
 Inversion of a triangular matrix

 Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri`

 Operations:
 * L = inv(L)
 * U = inv(U)

 ### `xpotrf`
 Cholesky decomposition of a symmetric (Hermitian) positive definite matrix

 Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf`

 Operations:
 * L L^T = A
 * U^T U = A

 ### `xpbtrf`
 Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix

 Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf`

 Operations:
 * L L^T = A
 * U^T U = A

 ### `xsytrf`
 LDL decomposition of a symmetric (or Hermitian) matrix

 Routines:
 * `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`,
 * `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`,
  `zhetrf_rook`

 Operations:
 * L D L^T = A
 * U^T D U = A

 ### `xgetrf`
 LU decomposition of a general matrix with pivoting

 Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf`

 Operation: P L U = A

 ### `xgbtrf`
 LU decomposition of a general banded matrix with pivoting

 Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf`

 Operation: L U = A

 ### `xtrsyl`
 Solution of the quasi-triangular Sylvester equation

 Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl`

 Operations:
 * A X + B Y = C -> X
 * A^T X + B Y = C -> X
 * A X + B^T Y = C -> X
 * A^T X + B^T Y = C -> X
 * A X - B Y = C -> X
 * A^T X - B Y = C -> X
 * A X - B^T Y = C -> X
 * A^T X - B^T Y = C -> X

 ### `xtgsyl`
 Solution of the generalized Sylvester equations

 Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl`

 Operations:
 * A R - L B = C, D R - L E = F -> L, R
 * A^T R + D^T L = C, R B^T - L E^T = -F -> L, R


 Covered BLAS extension
 ----------------------

 ### `xgemmt`
 Matrix-matrix product updating only a triangular part of the result

 Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt`

 Operations:
 * C = alpha A B + beta C
 * C = alpha A B^T + beta C
 * C = alpha A^T B + beta C
 * C = alpha A^T B^T + beta C


 Not covered yet
 ---------------
 The following operation is implemented as a blocked algorithm in LAPACK but
 currently not yet covered in ReLAPACK as a recursive algorithm

 ### `xpstrf`
 Cholesky decomposition of a positive semi-definite matrix with complete pivoting.

 Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf`

 Operations:
 * P L L^T P^T = A
 * P U^T U P^T = A


 Not covered: extra FLOPs
 ------------------------
 The following routines are not covered because recursive variants would require
 considerably more FLOPs or operate on banded matrices.

 ### QR decomposition (and related)
 Routines:
 * `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf`
 * `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf`
 * `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf`
 * `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf`
 * `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf`

 Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A

 Routines for multiplication with Q:
 * `sormqr`, `dormqr`, `cunmqr`, `zunmqr`
 * `sormrq`, `dormrq`, `cunmrq`, `zunmrq`
 * `sormql`, `dormql`, `cunmql`, `zunmql`
 * `sormlq`, `dormlq`, `cunmlq`, `zunmlq`
 * `sormrz`, `dormrz`, `cunmrz`, `zunmrz`

 Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T

 Routines for construction of Q:
 * `sorgqr`, `dorgqr`, `cungqr`, `zungqr`
 * `sorgrq`, `dorgrq`, `cungrq`, `zungrq`
 * `sorgql`, `dorgql`, `cungql`, `zungql`
 * `sorglq`, `dorglq`, `cunglq`, `zunglq`

 ### Symmetric reduction to tridiagonal
 Routines: `ssytrd`, `dsytrd`, `csytrd`, `zsytrd`

 Operation: Q T Q^T = A

 ### Symmetric reduction to bidiagonal
 Routines: `ssybrd`, `dsybrd`, `csybrd`, `zsybrd`

 Operation: Q T P^T = A

 ### Reduction to upper Hessenberg
 Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd`

 Operation: Q H Q^T = A
--- a/relapack/inc/relapack.h
+++ b/relapack/inc/relapack.h
@@ -0,0 +1,67 @@
 #ifndef RELAPACK_H
 #define RELAPACK_H

 void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
 void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
 void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
 void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);

 void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
 void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
 void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
 void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);

 void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
 void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
 void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
 void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);

 void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
 void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
 void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
 void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *);

 void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);

 void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *);
 void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *);
 void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *);
 void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *);

 void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
 void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
 void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
 void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);

 void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
 void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
 void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
 void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);

 void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
 void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
 void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
 void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);

 void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
 void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
 void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
 void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);

 void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
 void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
 void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);

 #endif /*  RELAPACK_H */
--- a/relapack/src/blas.h
+++ b/relapack/src/blas.h
@@ -0,0 +1,61 @@
 #ifndef BLAS_H
 #define BLAS_H

 extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *);
 extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *);
 extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *);
 extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *);

 extern void BLAS(sscal)(const int *, const float *, float *, const int *);
 extern void BLAS(dscal)(const int *, const double *, double *, const int *);
 extern void BLAS(cscal)(const int *, const float *, float *, const int *);
 extern void BLAS(zscal)(const int *, const double *, double *, const int *);

 extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *);
 extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *);
 extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *);
 extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *);

 extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
 extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
 extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
 extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);

 extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
 extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
 extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
 extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);

 extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
 extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
 extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
 extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);

 extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
 extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
 extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
 extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);

 extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
 extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
 extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
 extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);

 extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
 extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
 extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);

 extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
 extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
 extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
 extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);

 #if HAVE_XGEMMT
 extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
 extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
 extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
 extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
 #endif

 #endif /* BLAS_H */
--- a/relapack/src/cgbtrf.c
+++ b/relapack/src/cgbtrf.c
@@ -0,0 +1,230 @@
 #include "relapack.h"
 #include "stdlib.h"

 static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *,
    const int *, float *, const int *, int *, float *, const int *, float *,
    const int *, int *);


 /** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's cgbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html
 * */
 void RELAPACK_cgbtrf(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {

    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kl < 0)
        *info = -3;
    else if (*ku < 0)
        *info = -4;
    else if (*ldAb < 2 * *kl + *ku + 1)
        *info = -6;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CGBTRF", &minfo);
        return;
    }

    // Constant
    const float ZERO[] = { 0., 0. };

    // Result upper band width
    const int kv = *ku + *kl;

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + 2 * kv;

    // Zero upper diagonal fill-in elements
    int i, j;
    for (j = 0; j < *n; j++) {
        float *const A_j = A + 2 * *ldA * j;
        for (i = MAX(0, j - kv); i < j - *ku; i++)
            A_j[2 * i] = A_j[2 * i + 1] = 0.;
    }

    // Allocate work space
    const int n1 = CREC_SPLIT(*n);
    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
    const int nWorkl = (kv > n1) ? n1 : kv;
    const int mWorku = (*kl > n1) ? n1 : *kl;
    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
    float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float));
    float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float));
    LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);

    // Recursive kernel
    RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);

    // Free work space
    free(Workl);
    free(Worku);
 }


 /** cgbtrf's recursive compute kernel */
 static void RELAPACK_cgbtrf_rec(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_CGBTRF, 1)) {
        // Unblocked
        LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int   iONE[] = { 1 };

    // Loop iterators
    int i, j;

    // Output upper band width
    const int kv = *ku + *kl;

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + 2 * kv;

    // Splitting
    const int n1  = MIN(CREC_SPLIT(*n), *kl);
    const int n2  = *n - n1;
    const int m1  = MIN(n1, *m);
    const int m2  = *m - m1;
    const int mn1 = MIN(m1, n1);
    const int mn2 = MIN(m2, n2);

    // Ab_L *
    //      Ab_BR
    float *const Ab_L  = Ab;
    float *const Ab_BR = Ab + 2 * *ldAb * n1;

    // A_L A_R
    float *const A_L = A;
    float *const A_R = A + 2 * *ldA * n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A                 + 2 * m1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * m1;

    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;

    // Banded splitting
    const int n21 = MIN(n2, kv - n1);
    const int n22 = MIN(n2 - n21, n1);
    const int m21 = MIN(m2, *kl - m1);
    const int m22 = MIN(m2 - m21, m1);

    //   n1 n21  n22
    // m *  A_Rl ARr
    float *const A_Rl = A_R;
    float *const A_Rr = A_R + 2 * *ldA * n21;

    //     n1    n21    n22
    // m1  *     A_TRl  A_TRr
    // m21 A_BLt A_BRtl A_BRtr
    // m22 A_BLb A_BRbl A_BRbr
    float *const A_TRl  = A_TR;
    float *const A_TRr  = A_TR + 2 * *ldA * n21;
    float *const A_BLt  = A_BL;
    float *const A_BLb  = A_BL                  + 2 * m21;
    float *const A_BRtl = A_BR;
    float *const A_BRtr = A_BR + 2 * *ldA * n21;
    float *const A_BRbl = A_BR                  + 2 * m21;
    float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;

    // recursion(Ab_L, ipiv_T)
    RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);

    // Workl = A_BLb
    LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);

    // partially redo swaps in A_L
    for (i = 0; i < mn1; i++) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
            else
                BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
        }
    }

    // apply pivots to A_Rl
    LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);

    // apply pivots to A_Rr columnwise
    for (j = 0; j < n22; j++) {
        float *const A_Rrj = A_Rr + 2 * *ldA * j;
        for (i = j; i < mn1; i++) {
            const int ip = ipiv_T[i] - 1;
            if (ip != i) {
                const float tmpr = A_Rrj[2 * i];
                const float tmpc = A_Rrj[2 * i + 1];
                A_Rrj[2 * i]     = A_Rrj[2 * ip];
                A_Rrj[2 * i + 1] = A_Rr[2 * ip + 1];
                A_Rrj[2 * ip]     = tmpr;
                A_Rrj[2 * ip + 1] = tmpc;
            }
        }
    }

    // A_TRl = A_TL \ A_TRl
    BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
    // Worku = A_TRr
    LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
    // Worku = A_TL \ Worku
    BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
    // A_TRr = Worku
    LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
    // A_BRtl = A_BRtl - A_BLt * A_TRl
    BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
    // A_BRbl = A_BRbl - Workl * A_TRl
    BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
    // A_BRtr = A_BRtr - A_BLt * Worku
    BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
    // A_BRbr = A_BRbr - Workl * Worku
    BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);

    // partially undo swaps in A_L
    for (i = mn1 - 1; i >= 0; i--) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
            else
                BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
        }
    }

    // recursion(Ab_BR, ipiv_B)
    RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
    if (*info)
        *info += n1;
    // shift pivots
    for (i = 0; i < mn2; i++)
        ipiv_B[i] += n1;
 }
--- a/relapack/src/cgemmt.c
+++ b/relapack/src/cgemmt.c
@@ -0,0 +1,167 @@
 #include "relapack.h"

 static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
    const int *, const int *, const float *, const float *, const int *,
    const float *, const int *, const float *, float *, const int *);

 static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
    const int *, const int *, const float *, const float *, const int *,
    const float *, const int *, const float *, float *, const int *);


 /** CGEMMT computes a matrix-matrix product with general matrices but updates
 * only the upper or lower triangular part of the result matrix.
 *
 * This routine performs the same operation as the BLAS routine
 * cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
 * but only updates the triangular part of C specified by uplo:
 * If (*uplo == 'L'), only the lower triangular part of C is updated,
 * otherwise the upper triangular part is updated.
 * */
 void RELAPACK_cgemmt(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {

 #if HAVE_XGEMMT
    BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
    return;
 #else

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    const int notransA = LAPACK(lsame)(transA, "N");
    const int tranA = LAPACK(lsame)(transA, "T");
    const int ctransA = LAPACK(lsame)(transA, "C");
    const int notransB = LAPACK(lsame)(transB, "N");
    const int tranB = LAPACK(lsame)(transB, "T");
    const int ctransB = LAPACK(lsame)(transB, "C");
    int info = 0;
    if (!lower && !upper)
        info = 1;
    else if (!tranA && !ctransA && !notransA)
        info = 2;
    else if (!tranB && !ctransB && !notransB)
        info = 3;
    else if (*n < 0)
        info = 4;
    else if (*k < 0)
        info = 5;
    else if (*ldA < MAX(1, notransA ? *n : *k))
        info = 8;
    else if (*ldB < MAX(1, notransB ? *k : *n))
        info = 10;
    else if (*ldC < MAX(1, *n))
        info = 13;
    if (info) {
        LAPACK(xerbla)("CGEMMT", &info);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';
    const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
    const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');

    // Recursive kernel
    RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
 #endif
 }


 /** cgemmt's recursive compute kernel */
 static void RELAPACK_cgemmt_rec(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {

    if (*n <= MAX(CROSSOVER_CGEMMT, 1)) {
        // Unblocked
        RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
        return;
    }

    // Splitting
    const int n1 = CREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_T
    // A_B
    const float *const A_T = A;
    const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);

    // B_L B_R
    const float *const B_L = B;
    const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);

    // C_TL C_TR
    // C_BL C_BR
    float *const C_TL = C;
    float *const C_TR = C + 2 * *ldC * n1;
    float *const C_BL = C                 + 2 * n1;
    float *const C_BR = C + 2 * *ldC * n1 + 2 * n1;

    // recursion(C_TL)
    RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);

    if (*uplo == 'L')
        // C_BL = alpha A_B B_L + beta C_BL
        BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
    else
        // C_TR = alpha A_T B_R + beta C_TR
        BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);

    // recursion(C_BR)
    RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
 }


 /** cgemmt's unblocked compute kernel */
 static void RELAPACK_cgemmt_rec2(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {

    const int incB = (*transB == 'N') ? 1 : *ldB;
    const int incC = 1;

    int i;
    for (i = 0; i < *n; i++) {
        // A_0
        // A_i
        const float *const A_0 = A;
        const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);

        // * B_i *
        const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);

        // * C_0i *
        // * C_ii *
        float *const C_0i = C + 2 * *ldC * i;
        float *const C_ii = C + 2 * *ldC * i + 2 * i;

        if (*uplo == 'L') {
            const int nmi = *n - i;
            if (*transA == 'N')
                BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
            else
                BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
        } else {
            const int ip1 = i + 1;
            if (*transA == 'N')
                BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
            else
                BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
        }
    }
 }
--- a/relapack/src/cgetrf.c
+++ b/relapack/src/cgetrf.c
@@ -0,0 +1,117 @@
 #include "relapack.h"

 static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
    const int *, int *, int *);


 /** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's cgetrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
 */
 void RELAPACK_cgetrf(
    const int *m, const int *n,
    float *A, const int *ldA, int *ipiv,
    int *info
 ) {

    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CGETRF", &minfo);
        return;
    }

    const int sn = MIN(*m, *n);

    RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);

    // Right remainder
    if (*m < *n) {
        // Constants
        const float ONE[]  = { 1., 0. };
        const int   iONE[] = { 1 };

        // Splitting
        const int rn = *n - *m;

        // A_L A_R
        const float *const A_L = A;
        float *const       A_R = A + 2 * *ldA * *m;

        // A_R = apply(ipiv, A_R)
        LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
        // A_R = A_L \ A_R
        BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
    }
 }


 /** cgetrf's recursive compute kernel */
 static void RELAPACK_cgetrf_rec(
    const int *m, const int *n,
    float *A, const int *ldA, int *ipiv,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
        // Unblocked
        LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int   iONE[] = { 1 };

    // Splitting
    const int n1 = CREC_SPLIT(*n);
    const int n2 = *n - n1;
    const int m2 = *m - n1;

    // A_L A_R
    float *const A_L = A;
    float *const A_R = A + 2 * *ldA * n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A                 + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;

    // recursion(A_L, ipiv_T)
    RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
    // apply pivots to A_R
    LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);

    // A_TR = A_TL \ A_TR
    BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
    // A_BR = A_BR - A_BL * A_TR
    BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);

    // recursion(A_BR, ipiv_B)
    RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
    if (*info)
        *info += n1;
    // apply pivots to A_BL
    LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
    // shift pivots
    int i;
    for (i = 0; i < n2; i++)
        ipiv_B[i] += n1;
 }
--- a/relapack/src/chegst.c
+++ b/relapack/src/chegst.c
@@ -0,0 +1,212 @@
 #include "relapack.h"
 #if XSYGST_ALLOW_MALLOC
 #include "stdlib.h"
 #endif

 static void RELAPACK_chegst_rec(const int *, const char *, const int *,
    float *, const int *, const float *, const int *,
    float *, const int *, int *);


 /** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
 *
 * This routine is functionally equivalent to LAPACK's chegst.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html
 * */
 void RELAPACK_chegst(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (*itype < 1 || *itype > 3)
        *info = -1;
    else if (!lower && !upper)
        *info = -2;
    else if (*n < 0)
        *info = -3;
    else if (*ldA < MAX(1, *n))
        *info = -5;
    else if (*ldB < MAX(1, *n))
        *info = -7;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CHEGST", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Allocate work space
    float *Work = NULL;
    int   lWork = 0;
 #if XSYGST_ALLOW_MALLOC
    const int n1 = CREC_SPLIT(*n);
    lWork = n1 * (*n - n1);
    Work  = malloc(lWork * 2 * sizeof(float));
    if (!Work)
        lWork = 0;
 #endif

    // recursive kernel
    RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);

    // Free work space
 #if XSYGST_ALLOW_MALLOC
    if (Work)
        free(Work);
 #endif
 }


 /** chegst's recursive compute kernel */
 static void RELAPACK_chegst_rec(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    float *Work, const int *lWork, int *info
 ) {

    if (*n <= MAX(CROSSOVER_CHEGST, 1)) {
        // Unblocked
        LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info);
        return;
    }

    // Constants
    const float ZERO[]  = { 0., 0. };
    const float ONE[]   = { 1., 0. };
    const float MONE[]  = { -1., 0. };
    const float HALF[]  = { .5, 0. };
    const float MHALF[] = { -.5, 0. };
    const int   iONE[]  = { 1 };

    // Loop iterator
    int i;

    // Splitting
    const int n1 = CREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A                 + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // B_TL B_TR
    // B_BL B_BR
    const float *const B_TL = B;
    const float *const B_TR = B + 2 * *ldB * n1;
    const float *const B_BL = B                 + 2 * n1;
    const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;

    // recursion(A_TL, B_TL)
    RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);

    if (*itype == 1)
        if (*uplo == 'L') {
            // A_BL = A_BL / B_TL'
            BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            } else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
            BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR \ A_BL
            BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // A_TR = B_TL' \ A_TR
            BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_BL + T
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            } else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
            BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_BL + T
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR / B_BR
            BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }
    else
        if (*uplo == 'L') {
            // A_BL = A_BL * B_TL
            BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            } else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
            BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR * A_BL
            BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // A_TR = B_TL * A_TR
            BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * B_TR * A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            } else
                // A_TR = A_TR + 1/2 B_TR A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
            BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            else
                // A_TR = A_TR + 1/2 B_TR * A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR * B_BR
            BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }

    // recursion(A_BR, B_BR)
    RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
 }
--- a/relapack/src/chetrf.c
+++ b/relapack/src/chetrf.c
@@ -0,0 +1,236 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *,
    float *, const int *, int *, float *, const int *, int *);


 /** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's chetrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
 * */
 void RELAPACK_chetrf(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    float *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CHETRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
    int nout;

    // Recursive kernel
    RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** chetrf's recursive compute kernel */
 static void RELAPACK_chetrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
        // Unblocked
        if (top) {
            LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int   iONE[] = { 1 };

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL   = A                 + 2 * n1;
        float *const A_BR   = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A                 + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL =              Work                    + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL   = A + 2 * *ldA * n_rest        + 2 * n_rest;
        float *const A_TR   = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L  = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/chetrf_rec2.c
+++ b/relapack/src/chetrf_rec2.c
@@ -0,0 +1,520 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static complex c_b1 = {1.f,0.f};
 static int c__1 = 1;

 /** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method
 *
 * This routine is a minor modification of LAPACK's clahef.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int *
 	nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
 	int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2, r__3, r__4;
    complex q__1, q__2, q__3, q__4;

    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);

    /* Local variables */
    static int j, k;
    static float t, r1;
    static complex d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
 	    , complex *, int *, complex *, int *, complex *, complex *
 	    , int *, ftnlen), ccopy_(int *, complex *, int *,
 	    complex *, int *), cswap_(int *, complex *, int *,
 	    complex *, int *);
    static int kstep;
    static float absakk;
    extern /* Subroutine */ int clacgv_(int *, complex *, int *);
    extern int icamax_(int *, complex *, int *);
    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
 	    *);
    static float colmax, rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	kstep = 1;
 	i__1 = k - 1;
 	ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	i__1 = k + kw * w_dim1;
 	i__2 = k + k * a_dim1;
 	r__1 = a[i__2].r;
 	w[i__1].r = r__1, w[i__1].i = 0.f;
 	if (k < *n) {
 	    i__1 = *n - k;
 	    q__1.r = -1.f, q__1.i = -0.f;
 	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	    i__1 = k + kw * w_dim1;
 	    i__2 = k + kw * w_dim1;
 	    r__1 = w[i__2].r;
 	    w[i__1].r = r__1, w[i__1].i = 0.f;
 	}
 	i__1 = k + kw * w_dim1;
 	absakk = (r__1 = w[i__1].r, dabs(r__1));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    i__1 = imax + kw * w_dim1;
 	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
 		    + kw * w_dim1]), dabs(r__2));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = k + k * a_dim1;
 	    i__2 = k + k * a_dim1;
 	    r__1 = a[i__2].r;
 	    a[i__1].r = r__1, a[i__1].i = 0.f;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		i__1 = imax - 1;
 		ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = imax + (kw - 1) * w_dim1;
 		i__2 = imax + imax * a_dim1;
 		r__1 = a[i__2].r;
 		w[i__1].r = r__1, w[i__1].i = 0.f;
 		i__1 = k - imax;
 		ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		i__1 = k - imax;
 		clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    q__1.r = -1.f, q__1.i = -0.f;
 		    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		    i__1 = imax + (kw - 1) * w_dim1;
 		    i__2 = imax + (kw - 1) * w_dim1;
 		    r__1 = w[i__2].r;
 		    w[i__1].r = r__1, w[i__1].i = 0.f;
 		}
 		i__1 = k - imax;
 		jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
 			 &c__1);
 		i__1 = jmax + (kw - 1) * w_dim1;
 		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
 			jmax + (kw - 1) * w_dim1]), dabs(r__2));
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 /* Computing MAX */
 		    i__1 = jmax + (kw - 1) * w_dim1;
 		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
 			    r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
 			    r__2));
 		    rowmax = dmax(r__3,r__4);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else /* if(complicated condition) */ {
 		    i__1 = imax + (kw - 1) * w_dim1;
 		    if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
 			kp = imax;
 			ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 				w_dim1 + 1], &c__1);
 		    } else {
 			kp = imax;
 			kstep = 2;
 		    }
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		r__1 = a[i__2].r;
 		a[i__1].r = r__1, a[i__1].i = 0.f;
 		i__1 = kk - 1 - kp;
 		ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		i__1 = kk - 1 - kp;
 		clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
 		if (kp > 1) {
 		    i__1 = kp - 1;
 		    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
 			    + 1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
 			    + 1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		if (k > 1) {
 		    i__1 = k + k * a_dim1;
 		    r1 = 1.f / a[i__1].r;
 		    i__1 = k - 1;
 		    csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 		    i__1 = k - 1;
 		    clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 		}
 	    } else {
 		if (k > 2) {
 		    i__1 = k - 1 + kw * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    r_cnjg(&q__2, &d21);
 		    c_div(&q__1, &w[k + kw * w_dim1], &q__2);
 		    d11.r = q__1.r, d11.i = q__1.i;
 		    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
 		    d22.r = q__1.r, d22.i = q__1.i;
 		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    t = 1.f / (q__1.r - 1.f);
 		    q__2.r = t, q__2.i = 0.f;
 		    c_div(&q__1, &q__2, &d21);
 		    d21.r = q__1.r, d21.i = q__1.i;
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			i__2 = j + (k - 1) * a_dim1;
 			i__3 = j + (kw - 1) * w_dim1;
 			q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + kw * w_dim1;
 			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
 				.i;
 			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
 				d21.r * q__2.i + d21.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 			i__2 = j + k * a_dim1;
 			r_cnjg(&q__2, &d21);
 			i__3 = j + kw * w_dim1;
 			q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + (kw - 1) * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
 				q__2.r * q__3.i + q__2.i * q__3.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L20: */
 		    }
 		}
 		i__1 = k - 1 + (k - 1) * a_dim1;
 		i__2 = k - 1 + (kw - 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1 + k * a_dim1;
 		i__2 = k - 1 + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + k * a_dim1;
 		i__2 = k + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1;
 		clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 		i__1 = k - 2;
 		clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    ++j;
 	}
 	++j;
 	if (jp != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
 	}
 	if (j <= *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	kstep = 1;
 	i__1 = k + k * w_dim1;
 	i__2 = k + k * a_dim1;
 	r__1 = a[i__2].r;
 	w[i__1].r = r__1, w[i__1].i = 0.f;
 	if (k < *n) {
 	    i__1 = *n - k;
 	    ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
 		    w_dim1], &c__1);
 	}
 	i__1 = *n - k + 1;
 	i__2 = k - 1;
 	q__1.r = -1.f, q__1.i = -0.f;
 	cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
 		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
 	i__1 = k + k * w_dim1;
 	i__2 = k + k * w_dim1;
 	r__1 = w[i__2].r;
 	w[i__1].r = r__1, w[i__1].i = 0.f;
 	i__1 = k + k * w_dim1;
 	absakk = (r__1 = w[i__1].r, dabs(r__1));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    i__1 = imax + k * w_dim1;
 	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
 		    + k * w_dim1]), dabs(r__2));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = k + k * a_dim1;
 	    i__2 = k + k * a_dim1;
 	    r__1 = a[i__2].r;
 	    a[i__1].r = r__1, a[i__1].i = 0.f;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		i__1 = imax - k;
 		ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = imax - k;
 		clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
 		i__1 = imax + (k + 1) * w_dim1;
 		i__2 = imax + imax * a_dim1;
 		r__1 = a[i__2].r;
 		w[i__1].r = r__1, w[i__1].i = 0.f;
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
 			    imax + 1 + (k + 1) * w_dim1], &c__1);
 		}
 		i__1 = *n - k + 1;
 		i__2 = k - 1;
 		q__1.r = -1.f, q__1.i = -0.f;
 		cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
 			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
 			w_dim1], &c__1, (ftnlen)12);
 		i__1 = imax + (k + 1) * w_dim1;
 		i__2 = imax + (k + 1) * w_dim1;
 		r__1 = w[i__2].r;
 		w[i__1].r = r__1, w[i__1].i = 0.f;
 		i__1 = imax - k;
 		jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
 			;
 		i__1 = jmax + (k + 1) * w_dim1;
 		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
 			jmax + (k + 1) * w_dim1]), dabs(r__2));
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 /* Computing MAX */
 		    i__1 = jmax + (k + 1) * w_dim1;
 		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
 			    r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
 			    r__2));
 		    rowmax = dmax(r__3,r__4);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else /* if(complicated condition) */ {
 		    i__1 = imax + (k + 1) * w_dim1;
 		    if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
 			kp = imax;
 			i__1 = *n - k + 1;
 			ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
 				k * w_dim1], &c__1);
 		    } else {
 			kp = imax;
 			kstep = 2;
 		    }
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		r__1 = a[i__2].r;
 		a[i__1].r = r__1, a[i__1].i = 0.f;
 		i__1 = kp - kk - 1;
 		ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
 			1) * a_dim1], lda);
 		i__1 = kp - kk - 1;
 		clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
 		if (kp < *n) {
 		    i__1 = *n - kp;
 		    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
 			    + kp * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		}
 		cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    i__1 = k + k * a_dim1;
 		    r1 = 1.f / a[i__1].r;
 		    i__1 = *n - k;
 		    csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		    i__1 = *n - k;
 		    clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 		}
 	    } else {
 		if (k < *n - 1) {
 		    i__1 = k + 1 + k * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
 		    d11.r = q__1.r, d11.i = q__1.i;
 		    r_cnjg(&q__2, &d21);
 		    c_div(&q__1, &w[k + k * w_dim1], &q__2);
 		    d22.r = q__1.r, d22.i = q__1.i;
 		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    t = 1.f / (q__1.r - 1.f);
 		    q__2.r = t, q__2.i = 0.f;
 		    c_div(&q__1, &q__2, &d21);
 		    d21.r = q__1.r, d21.i = q__1.i;
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			i__2 = j + k * a_dim1;
 			r_cnjg(&q__2, &d21);
 			i__3 = j + k * w_dim1;
 			q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + (k + 1) * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
 				q__2.r * q__3.i + q__2.i * q__3.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 			i__2 = j + (k + 1) * a_dim1;
 			i__3 = j + (k + 1) * w_dim1;
 			q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + k * w_dim1;
 			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
 				.i;
 			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
 				d21.r * q__2.i + d21.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L80: */
 		    }
 		}
 		i__1 = k + k * a_dim1;
 		i__2 = k + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + k * a_dim1;
 		i__2 = k + 1 + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + (k + 1) * a_dim1;
 		i__2 = k + 1 + (k + 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = *n - k;
 		clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 		i__1 = *n - k - 1;
 		clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    --j;
 	}
 	--j;
 	if (jp != jj && j >= 1) {
 	    cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j >= 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/chetrf_rook.c
+++ b/relapack/src/chetrf_rook.c
@@ -0,0 +1,236 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *,
    float *, const int *, int *, float *, const int *, int *);


 /** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's chetrf_rook.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html
 * */
 void RELAPACK_chetrf_rook(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    float *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CHETRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
    int nout;

    // Recursive kernel
    RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** chetrf_rook's recursive compute kernel */
 static void RELAPACK_chetrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
        // Unblocked
        if (top) {
            LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int   iONE[] = { 1 };

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL   = A                 + 2 * n1;
        float *const A_BR   = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A                 + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL =              Work                    + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL   = A + 2 * *ldA * n_rest        + 2 * n_rest;
        float *const A_TR   = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L  = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/chetrf_rook_rec2.c
+++ b/relapack/src/chetrf_rook_rec2.c
@@ -0,0 +1,661 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static complex c_b1 = {1.f,0.f};
 static int c__1 = 1;

 /** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method
 *
 * This routine is a minor modification of LAPACK's clahef_rook.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
 	int *nb, int *kb, complex *a, int *lda, int *ipiv,
 	complex *w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2;
    complex q__1, q__2, q__3, q__4, q__5;

    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);

    /* Local variables */
    static int j, k, p;
    static float t, r1;
    static complex d11, d21, d22;
    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
    static logical done;
    static int imax, jmax;
    static float alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
 	    , complex *, int *, complex *, int *, complex *, complex *
 	    , int *, ftnlen);
    static float sfmin;
    extern /* Subroutine */ int ccopy_(int *, complex *, int *,
 	    complex *, int *);
    static int itemp;
    extern /* Subroutine */ int cswap_(int *, complex *, int *,
 	    complex *, int *);
    static int kstep;
    static float stemp, absakk;
    extern /* Subroutine */ int clacgv_(int *, complex *, int *);
    extern int icamax_(int *, complex *, int *);
    extern double slamch_(char *, ftnlen);
    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
 	    *);
    static float colmax, rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    sfmin = slamch_("S", (ftnlen)1);
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	kstep = 1;
 	p = k;
 	if (k > 1) {
 	    i__1 = k - 1;
 	    ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
 		    c__1);
 	}
 	i__1 = k + kw * w_dim1;
 	i__2 = k + k * a_dim1;
 	r__1 = a[i__2].r;
 	w[i__1].r = r__1, w[i__1].i = 0.f;
 	if (k < *n) {
 	    i__1 = *n - k;
 	    q__1.r = -1.f, q__1.i = -0.f;
 	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	    i__1 = k + kw * w_dim1;
 	    i__2 = k + kw * w_dim1;
 	    r__1 = w[i__2].r;
 	    w[i__1].r = r__1, w[i__1].i = 0.f;
 	}
 	i__1 = k + kw * w_dim1;
 	absakk = (r__1 = w[i__1].r, dabs(r__1));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    i__1 = imax + kw * w_dim1;
 	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
 		    + kw * w_dim1]), dabs(r__2));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = k + k * a_dim1;
 	    i__2 = k + kw * w_dim1;
 	    r__1 = w[i__2].r;
 	    a[i__1].r = r__1, a[i__1].i = 0.f;
 	    if (k > 1) {
 		i__1 = k - 1;
 		ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
 			&c__1);
 	    }
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L12:
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			    w_dim1 + 1], &c__1);
 		}
 		i__1 = imax + (kw - 1) * w_dim1;
 		i__2 = imax + imax * a_dim1;
 		r__1 = a[i__2].r;
 		w[i__1].r = r__1, w[i__1].i = 0.f;
 		i__1 = k - imax;
 		ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		i__1 = k - imax;
 		clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    q__1.r = -1.f, q__1.i = -0.f;
 		    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		    i__1 = imax + (kw - 1) * w_dim1;
 		    i__2 = imax + (kw - 1) * w_dim1;
 		    r__1 = w[i__2].r;
 		    w[i__1].r = r__1, w[i__1].i = 0.f;
 		}
 		if (imax != k) {
 		    i__1 = k - imax;
 		    jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
 			    w_dim1], &c__1);
 		    i__1 = jmax + (kw - 1) * w_dim1;
 		    rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
 			    w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
 		} else {
 		    rowmax = 0.f;
 		}
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 		    i__1 = itemp + (kw - 1) * w_dim1;
 		    stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
 			    w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
 		    if (stemp > rowmax) {
 			rowmax = stemp;
 			jmax = itemp;
 		    }
 		}
 		i__1 = imax + (kw - 1) * w_dim1;
 		if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
 		    kp = imax;
 		    ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		}
 		if (! done) {
 		    goto L12;
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kstep == 2 && p != k) {
 		i__1 = p + p * a_dim1;
 		i__2 = k + k * a_dim1;
 		r__1 = a[i__2].r;
 		a[i__1].r = r__1, a[i__1].i = 0.f;
 		i__1 = k - 1 - p;
 		ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
 			a_dim1], lda);
 		i__1 = k - 1 - p;
 		clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
 		if (p > 1) {
 		    i__1 = p - 1;
 		    ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
 			    1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
 			    1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
 			 ldw);
 	    }
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		r__1 = a[i__2].r;
 		a[i__1].r = r__1, a[i__1].i = 0.f;
 		i__1 = kk - 1 - kp;
 		ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		i__1 = kk - 1 - kp;
 		clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
 		if (kp > 1) {
 		    i__1 = kp - 1;
 		    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
 			    + 1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
 			    + 1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		if (k > 1) {
 		    i__1 = k + k * a_dim1;
 		    t = a[i__1].r;
 		    if (dabs(t) >= sfmin) {
 			r1 = 1.f / t;
 			i__1 = k - 1;
 			csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 		    } else {
 			i__1 = k - 1;
 			for (ii = 1; ii <= i__1; ++ii) {
 			    i__2 = ii + k * a_dim1;
 			    i__3 = ii + k * a_dim1;
 			    q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
 			    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L14: */
 			}
 		    }
 		    i__1 = k - 1;
 		    clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 		}
 	    } else {
 		if (k > 2) {
 		    i__1 = k - 1 + kw * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    r_cnjg(&q__2, &d21);
 		    c_div(&q__1, &w[k + kw * w_dim1], &q__2);
 		    d11.r = q__1.r, d11.i = q__1.i;
 		    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
 		    d22.r = q__1.r, d22.i = q__1.i;
 		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    t = 1.f / (q__1.r - 1.f);
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			i__2 = j + (k - 1) * a_dim1;
 			i__3 = j + (kw - 1) * w_dim1;
 			q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + kw * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			c_div(&q__2, &q__3, &d21);
 			q__1.r = t * q__2.r, q__1.i = t * q__2.i;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 			i__2 = j + k * a_dim1;
 			i__3 = j + kw * w_dim1;
 			q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + (kw - 1) * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			r_cnjg(&q__5, &d21);
 			c_div(&q__2, &q__3, &q__5);
 			q__1.r = t * q__2.r, q__1.i = t * q__2.i;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L20: */
 		    }
 		}
 		i__1 = k - 1 + (k - 1) * a_dim1;
 		i__2 = k - 1 + (kw - 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1 + k * a_dim1;
 		i__2 = k - 1 + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + k * a_dim1;
 		i__2 = k + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1;
 		clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 		i__1 = k - 2;
 		clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    ++j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	++j;
 	if (jp2 != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	++jj;
 	if (kstep == 2 && jp1 != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	if (j < *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	kstep = 1;
 	p = k;
 	i__1 = k + k * w_dim1;
 	i__2 = k + k * a_dim1;
 	r__1 = a[i__2].r;
 	w[i__1].r = r__1, w[i__1].i = 0.f;
 	if (k < *n) {
 	    i__1 = *n - k;
 	    ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
 		    w_dim1], &c__1);
 	}
 	if (k > 1) {
 	    i__1 = *n - k + 1;
 	    i__2 = k - 1;
 	    q__1.r = -1.f, q__1.i = -0.f;
 	    cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
 		    w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
 		    ftnlen)12);
 	    i__1 = k + k * w_dim1;
 	    i__2 = k + k * w_dim1;
 	    r__1 = w[i__2].r;
 	    w[i__1].r = r__1, w[i__1].i = 0.f;
 	}
 	i__1 = k + k * w_dim1;
 	absakk = (r__1 = w[i__1].r, dabs(r__1));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    i__1 = imax + k * w_dim1;
 	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
 		    + k * w_dim1]), dabs(r__2));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = k + k * a_dim1;
 	    i__2 = k + k * w_dim1;
 	    r__1 = w[i__2].r;
 	    a[i__1].r = r__1, a[i__1].i = 0.f;
 	    if (k < *n) {
 		i__1 = *n - k;
 		ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
 			a_dim1], &c__1);
 	    }
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L72:
 		i__1 = imax - k;
 		ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = imax - k;
 		clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
 		i__1 = imax + (k + 1) * w_dim1;
 		i__2 = imax + imax * a_dim1;
 		r__1 = a[i__2].r;
 		w[i__1].r = r__1, w[i__1].i = 0.f;
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
 			    imax + 1 + (k + 1) * w_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = *n - k + 1;
 		    i__2 = k - 1;
 		    q__1.r = -1.f, q__1.i = -0.f;
 		    cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
 			    , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
 			    1) * w_dim1], &c__1, (ftnlen)12);
 		    i__1 = imax + (k + 1) * w_dim1;
 		    i__2 = imax + (k + 1) * w_dim1;
 		    r__1 = w[i__2].r;
 		    w[i__1].r = r__1, w[i__1].i = 0.f;
 		}
 		if (imax != k) {
 		    i__1 = imax - k;
 		    jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
 			    c__1);
 		    i__1 = jmax + (k + 1) * w_dim1;
 		    rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
 			    w[jmax + (k + 1) * w_dim1]), dabs(r__2));
 		} else {
 		    rowmax = 0.f;
 		}
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 		    i__1 = itemp + (k + 1) * w_dim1;
 		    stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
 			    w[itemp + (k + 1) * w_dim1]), dabs(r__2));
 		    if (stemp > rowmax) {
 			rowmax = stemp;
 			jmax = itemp;
 		    }
 		}
 		i__1 = imax + (k + 1) * w_dim1;
 		if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
 		    kp = imax;
 		    i__1 = *n - k + 1;
 		    ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    i__1 = *n - k + 1;
 		    ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		}
 		if (! done) {
 		    goto L72;
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kstep == 2 && p != k) {
 		i__1 = p + p * a_dim1;
 		i__2 = k + k * a_dim1;
 		r__1 = a[i__2].r;
 		a[i__1].r = r__1, a[i__1].i = 0.f;
 		i__1 = p - k - 1;
 		ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
 			a_dim1], lda);
 		i__1 = p - k - 1;
 		clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
 		if (p < *n) {
 		    i__1 = *n - p;
 		    ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
 			    * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
 		}
 		cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
 	    }
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		r__1 = a[i__2].r;
 		a[i__1].r = r__1, a[i__1].i = 0.f;
 		i__1 = kp - kk - 1;
 		ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
 			1) * a_dim1], lda);
 		i__1 = kp - kk - 1;
 		clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
 		if (kp < *n) {
 		    i__1 = *n - kp;
 		    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
 			    + kp * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		}
 		cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    i__1 = k + k * a_dim1;
 		    t = a[i__1].r;
 		    if (dabs(t) >= sfmin) {
 			r1 = 1.f / t;
 			i__1 = *n - k;
 			csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		    } else {
 			i__1 = *n;
 			for (ii = k + 1; ii <= i__1; ++ii) {
 			    i__2 = ii + k * a_dim1;
 			    i__3 = ii + k * a_dim1;
 			    q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
 			    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L74: */
 			}
 		    }
 		    i__1 = *n - k;
 		    clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 		}
 	    } else {
 		if (k < *n - 1) {
 		    i__1 = k + 1 + k * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
 		    d11.r = q__1.r, d11.i = q__1.i;
 		    r_cnjg(&q__2, &d21);
 		    c_div(&q__1, &w[k + k * w_dim1], &q__2);
 		    d22.r = q__1.r, d22.i = q__1.i;
 		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    t = 1.f / (q__1.r - 1.f);
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			i__2 = j + k * a_dim1;
 			i__3 = j + k * w_dim1;
 			q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + (k + 1) * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			r_cnjg(&q__5, &d21);
 			c_div(&q__2, &q__3, &q__5);
 			q__1.r = t * q__2.r, q__1.i = t * q__2.i;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 			i__2 = j + (k + 1) * a_dim1;
 			i__3 = j + (k + 1) * w_dim1;
 			q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + k * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			c_div(&q__2, &q__3, &d21);
 			q__1.r = t * q__2.r, q__1.i = t * q__2.i;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L80: */
 		    }
 		}
 		i__1 = k + k * a_dim1;
 		i__2 = k + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + k * a_dim1;
 		i__2 = k + 1 + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + (k + 1) * a_dim1;
 		i__2 = k + 1 + (k + 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = *n - k;
 		clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 		i__1 = *n - k - 1;
 		clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    --j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	--j;
 	if (jp2 != jj && j >= 1) {
 	    cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	--jj;
 	if (kstep == 2 && jp1 != jj && j >= 1) {
 	    cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j > 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/clauum.c
+++ b/relapack/src/clauum.c
@@ -0,0 +1,87 @@
 #include "relapack.h"

 static void RELAPACK_clauum_rec(const char *, const int *, float *,
    const int *, int *);


 /** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
 *
 * This routine is functionally equivalent to LAPACK's clauum.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html
 * */
 void RELAPACK_clauum(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CLAUUM", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Recursive kernel
    RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info);
 }


 /** clauum's recursive compute kernel */
 static void RELAPACK_clauum_rec(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_CLAUUM, 1)) {
        // Unblocked
        LAPACK(clauu2)(uplo, n, A, ldA, info);
        return;
    }

    // Constants
    const float ONE[] = { 1., 0. };

    // Splitting
    const int n1 = CREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A                 + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL)
    RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info);

    if (*uplo == 'L') {
        // A_TL = A_TL + A_BL' * A_BL
        BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
        // A_BL = A_BR' * A_BL
        BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
    } else {
        // A_TL = A_TL + A_TR * A_TR'
        BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
        // A_TR = A_TR * A_BR'
        BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info);
 }
--- a/relapack/src/cpbtrf.c
+++ b/relapack/src/cpbtrf.c
@@ -0,0 +1,157 @@
 #include "relapack.h"
 #include "stdlib.h"

 static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *,
    float *, const int *, float *, const int *, int *);


 /** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
 *
 * This routine is functionally equivalent to LAPACK's cpbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html
 * */
 void RELAPACK_cpbtrf(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kd < 0)
        *info = -3;
    else if (*ldAb < *kd + 1)
        *info = -5;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CPBTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Constant
    const float ZERO[] = { 0., 0. };

    // Allocate work space
    const int n1 = CREC_SPLIT(*n);
    const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
    const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
    float *Work = malloc(mWork * nWork * 2 * sizeof(float));
    LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);

    // Recursive kernel
    RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);

    // Free work space
    free(Work);
 }


 /** cpbtrf's recursive compute kernel */
 static void RELAPACK_cpbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    float *Work, const int *ldWork,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_CPBTRF, 1)) {
        // Unblocked
        LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);

    // Splitting
    const int n1 = MIN(CREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;

    // * *
    // * Ab_BR
    float *const Ab_BR = Ab + 2 * *ldAb * n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A                 + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL)
    RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    // Banded splitting
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, *kd);

    //     n1    n21    n22
    // n1  *     A_TRl  A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    float *const A_TRl  = A_TR;
    float *const A_TRr  = A_TR + 2 * *ldA * n21;
    float *const A_BLt  = A_BL;
    float *const A_BLb  = A_BL                   + 2 * n21;
    float *const A_BRtl = A_BR;
    float *const A_BRtr = A_BR + 2 * *ldA * n21;
    float *const A_BRbl = A_BR                   + 2 * n21;
    float *const A_BRbr = A_BR + 2 * *ldA * n21  + 2 * n21;

    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb
        LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work
        LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr
        LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work
        LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }

    // recursion(A_BR)
    if (*kd > n1)
        RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/cpotrf.c
+++ b/relapack/src/cpotrf.c
@@ -0,0 +1,92 @@
 #include "relapack.h"

 static void RELAPACK_cpotrf_rec(const char *, const int *, float *,
        const int *, int *);


 /** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
 *
 * This routine is functionally equivalent to LAPACK's cpotrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html
 * */
 void RELAPACK_cpotrf(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CPOTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Recursive kernel
    RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info);
 }


 /** cpotrf's recursive compute kernel */
 static void RELAPACK_cpotrf_rec(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_CPOTRF, 1)) {
        // Unblocked
        LAPACK(cpotf2)(uplo, n, A, ldA, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };

    // Splitting
    const int n1 = CREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A                 + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL)
    RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    if (*uplo == 'L') {
        // A_BL = A_BL / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
        // A_BR = A_BR - A_BL * A_BL'
        BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
    } else {
        // A_TR = A_TL' \ A_TR
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
        // A_BR = A_BR - A_TR' * A_TR
        BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/csytrf.c
+++ b/relapack/src/csytrf.c
@@ -0,0 +1,238 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *,
    float *, const int *, int *, float *, const int *, int *);


 /** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's csytrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html
 * */
 void RELAPACK_csytrf(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    float *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CSYTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy arguments
    int nout;

    // Recursive kernel
    RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** csytrf's recursive compute kernel */
 static void RELAPACK_csytrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CSYTRF, 3)) {
        // Unblocked
        if (top) {
            LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int   iONE[] = { 1 };

    // Loop iterator
    int i;

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL   = A                 + 2 * n1;
        float *const A_BR   = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A                 + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL =              Work                    + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info  = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1  = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL   = A + 2 * *ldA * n_rest        + 2 * n_rest;
        float *const A_TR   = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L  = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/csytrf_rec2.c
+++ b/relapack/src/csytrf_rec2.c
@@ -0,0 +1,451 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static complex c_b1 = {1.f,0.f};
 static int c__1 = 1;

 /** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method.
 *
 * This routine is a minor modification of LAPACK's clasyf.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
 	nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
 	int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2, r__3, r__4;
    complex q__1, q__2, q__3;

    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void c_div(complex *, complex *, complex *);

    /* Local variables */
    static int j, k;
    static complex t, r1, d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
 	    int *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
 	    , complex *, int *, complex *, int *, complex *, complex *
 	    , int *, ftnlen), ccopy_(int *, complex *, int *,
 	    complex *, int *), cswap_(int *, complex *, int *,
 	    complex *, int *);
    static int kstep;
    static float absakk;
    extern int icamax_(int *, complex *, int *);
    static float colmax, rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	if (k < *n) {
 	    i__1 = *n - k;
 	    q__1.r = -1.f, q__1.i = -0.f;
 	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	}
 	kstep = 1;
 	i__1 = k + kw * w_dim1;
 	absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
 		w_dim1]), dabs(r__2));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    i__1 = imax + kw * w_dim1;
 	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
 		    + kw * w_dim1]), dabs(r__2));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = k - imax;
 		ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    q__1.r = -1.f, q__1.i = -0.f;
 		    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		}
 		i__1 = k - imax;
 		jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
 			 &c__1);
 		i__1 = jmax + (kw - 1) * w_dim1;
 		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
 			jmax + (kw - 1) * w_dim1]), dabs(r__2));
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 /* Computing MAX */
 		    i__1 = jmax + (kw - 1) * w_dim1;
 		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
 			    r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
 			    r__2));
 		    rowmax = dmax(r__3,r__4);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else /* if(complicated condition) */ {
 		    i__1 = imax + (kw - 1) * w_dim1;
 		    if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
 			    imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha *
 			     rowmax) {
 			kp = imax;
 			ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 				w_dim1 + 1], &c__1);
 		    } else {
 			kp = imax;
 			kstep = 2;
 		    }
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
 		i__1 = kk - 1 - kp;
 		ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		if (kp > 1) {
 		    i__1 = kp - 1;
 		    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
 			    + 1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
 			    + 1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
 		r1.r = q__1.r, r1.i = q__1.i;
 		i__1 = k - 1;
 		cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 	    } else {
 		if (k > 2) {
 		    i__1 = k - 1 + kw * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    c_div(&q__1, &w[k + kw * w_dim1], &d21);
 		    d11.r = q__1.r, d11.i = q__1.i;
 		    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
 		    d22.r = q__1.r, d22.i = q__1.i;
 		    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
 		    c_div(&q__1, &c_b1, &q__2);
 		    t.r = q__1.r, t.i = q__1.i;
 		    c_div(&q__1, &t, &d21);
 		    d21.r = q__1.r, d21.i = q__1.i;
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			i__2 = j + (k - 1) * a_dim1;
 			i__3 = j + (kw - 1) * w_dim1;
 			q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + kw * w_dim1;
 			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
 				.i;
 			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
 				d21.r * q__2.i + d21.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 			i__2 = j + k * a_dim1;
 			i__3 = j + kw * w_dim1;
 			q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + (kw - 1) * w_dim1;
 			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
 				.i;
 			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
 				d21.r * q__2.i + d21.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L20: */
 		    }
 		}
 		i__1 = k - 1 + (k - 1) * a_dim1;
 		i__2 = k - 1 + (kw - 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1 + k * a_dim1;
 		i__2 = k - 1 + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + k * a_dim1;
 		i__2 = k + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    ++j;
 	}
 	++j;
 	if (jp != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
 	}
 	if (j < *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	i__1 = *n - k + 1;
 	ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
 	i__1 = *n - k + 1;
 	i__2 = k - 1;
 	q__1.r = -1.f, q__1.i = -0.f;
 	cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
 		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
 	kstep = 1;
 	i__1 = k + k * w_dim1;
 	absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
 		w_dim1]), dabs(r__2));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    i__1 = imax + k * w_dim1;
 	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
 		    + k * w_dim1]), dabs(r__2));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		i__1 = imax - k;
 		ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = *n - imax + 1;
 		ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
 			1) * w_dim1], &c__1);
 		i__1 = *n - k + 1;
 		i__2 = k - 1;
 		q__1.r = -1.f, q__1.i = -0.f;
 		cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
 			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
 			w_dim1], &c__1, (ftnlen)12);
 		i__1 = imax - k;
 		jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
 			;
 		i__1 = jmax + (k + 1) * w_dim1;
 		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
 			jmax + (k + 1) * w_dim1]), dabs(r__2));
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 /* Computing MAX */
 		    i__1 = jmax + (k + 1) * w_dim1;
 		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
 			    r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
 			    r__2));
 		    rowmax = dmax(r__3,r__4);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else /* if(complicated condition) */ {
 		    i__1 = imax + (k + 1) * w_dim1;
 		    if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
 			    imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha *
 			    rowmax) {
 			kp = imax;
 			i__1 = *n - k + 1;
 			ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
 				k * w_dim1], &c__1);
 		    } else {
 			kp = imax;
 			kstep = 2;
 		    }
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
 		i__1 = kp - kk - 1;
 		ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
 			1) * a_dim1], lda);
 		if (kp < *n) {
 		    i__1 = *n - kp;
 		    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
 			    + kp * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		}
 		cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
 		    r1.r = q__1.r, r1.i = q__1.i;
 		    i__1 = *n - k;
 		    cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		}
 	    } else {
 		if (k < *n - 1) {
 		    i__1 = k + 1 + k * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
 		    d11.r = q__1.r, d11.i = q__1.i;
 		    c_div(&q__1, &w[k + k * w_dim1], &d21);
 		    d22.r = q__1.r, d22.i = q__1.i;
 		    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
 		    c_div(&q__1, &c_b1, &q__2);
 		    t.r = q__1.r, t.i = q__1.i;
 		    c_div(&q__1, &t, &d21);
 		    d21.r = q__1.r, d21.i = q__1.i;
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			i__2 = j + k * a_dim1;
 			i__3 = j + k * w_dim1;
 			q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + (k + 1) * w_dim1;
 			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
 				.i;
 			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
 				d21.r * q__2.i + d21.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 			i__2 = j + (k + 1) * a_dim1;
 			i__3 = j + (k + 1) * w_dim1;
 			q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + k * w_dim1;
 			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
 				.i;
 			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
 				d21.r * q__2.i + d21.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L80: */
 		    }
 		}
 		i__1 = k + k * a_dim1;
 		i__2 = k + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + k * a_dim1;
 		i__2 = k + 1 + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + (k + 1) * a_dim1;
 		i__2 = k + 1 + (k + 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    --j;
 	}
 	--j;
 	if (jp != jj && j >= 1) {
 	    cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j > 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/csytrf_rook.c
+++ b/relapack/src/csytrf_rook.c
@@ -0,0 +1,236 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *,
    float *, const int *, int *, float *, const int *, int *);


 /** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's csytrf_rook.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html
 * */
 void RELAPACK_csytrf_rook(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    float *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CSYTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
    int nout;

    // Recursive kernel
    RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** csytrf_rook's recursive compute kernel */
 static void RELAPACK_csytrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) {
        // Unblocked
        if (top) {
            LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int   iONE[] = { 1 };

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2   = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL   = A                 + 2 * n1;
        float *const A_BR   = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A                 + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL =              Work                    + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info  = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL   = A + 2 * *ldA * n_rest        + 2 * n_rest;
        float *const A_TR   = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L  = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/csytrf_rook_rec2.c
+++ b/relapack/src/csytrf_rook_rec2.c
@@ -0,0 +1,565 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static complex c_b1 = {1.f,0.f};
 static int c__1 = 1;

 /** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method.
 *
 * This routine is a minor modification of LAPACK's clasyf_rook.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
 	int *nb, int *kb, complex *a, int *lda, int *ipiv,
 	complex *w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2;
    complex q__1, q__2, q__3, q__4;

    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void c_div(complex *, complex *, complex *);

    /* Local variables */
    static int j, k, p;
    static complex t, r1, d11, d12, d21, d22;
    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
    static logical done;
    static int imax, jmax;
    static float alpha;
    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
 	    int *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
 	    , complex *, int *, complex *, int *, complex *, complex *
 	    , int *, ftnlen);
    static float sfmin;
    extern /* Subroutine */ int ccopy_(int *, complex *, int *,
 	    complex *, int *);
    static int itemp;
    extern /* Subroutine */ int cswap_(int *, complex *, int *,
 	    complex *, int *);
    static int kstep;
    static float stemp, absakk;
    extern int icamax_(int *, complex *, int *);
    extern double slamch_(char *, ftnlen);
    static float colmax, rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    sfmin = slamch_("S", (ftnlen)1);
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	kstep = 1;
 	p = k;
 	ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	if (k < *n) {
 	    i__1 = *n - k;
 	    q__1.r = -1.f, q__1.i = -0.f;
 	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	}
 	i__1 = k + kw * w_dim1;
 	absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
 		w_dim1]), dabs(r__2));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    i__1 = imax + kw * w_dim1;
 	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
 		    + kw * w_dim1]), dabs(r__2));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L12:
 		ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = k - imax;
 		ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    q__1.r = -1.f, q__1.i = -0.f;
 		    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		}
 		if (imax != k) {
 		    i__1 = k - imax;
 		    jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
 			    w_dim1], &c__1);
 		    i__1 = jmax + (kw - 1) * w_dim1;
 		    rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
 			    w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
 		} else {
 		    rowmax = 0.f;
 		}
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 		    i__1 = itemp + (kw - 1) * w_dim1;
 		    stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
 			    w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
 		    if (stemp > rowmax) {
 			rowmax = stemp;
 			jmax = itemp;
 		    }
 		}
 		i__1 = imax + (kw - 1) * w_dim1;
 		if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
 			imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha *
 			rowmax)) {
 		    kp = imax;
 		    ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		}
 		if (! done) {
 		    goto L12;
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kstep == 2 && p != k) {
 		i__1 = k - p;
 		ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
 			a_dim1], lda);
 		ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
 			c__1);
 		i__1 = *n - k + 1;
 		cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
 			lda);
 		i__1 = *n - kk + 1;
 		cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
 			 ldw);
 	    }
 	    if (kp != kk) {
 		i__1 = kp + k * a_dim1;
 		i__2 = kk + k * a_dim1;
 		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
 		i__1 = k - 1 - kp;
 		ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
 			c__1);
 		i__1 = *n - kk + 1;
 		cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
 			 lda);
 		i__1 = *n - kk + 1;
 		cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		if (k > 1) {
 		    i__1 = k + k * a_dim1;
 		    if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
 			    + k * a_dim1]), dabs(r__2)) >= sfmin) {
 			c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
 			r1.r = q__1.r, r1.i = q__1.i;
 			i__1 = k - 1;
 			cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 		    } else /* if(complicated condition) */ {
 			i__1 = k + k * a_dim1;
 			if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
 			    i__1 = k - 1;
 			    for (ii = 1; ii <= i__1; ++ii) {
 				i__2 = ii + k * a_dim1;
 				c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
 					a_dim1]);
 				a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L14: */
 			    }
 			}
 		    }
 		}
 	    } else {
 		if (k > 2) {
 		    i__1 = k - 1 + kw * w_dim1;
 		    d12.r = w[i__1].r, d12.i = w[i__1].i;
 		    c_div(&q__1, &w[k + kw * w_dim1], &d12);
 		    d11.r = q__1.r, d11.i = q__1.i;
 		    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
 		    d22.r = q__1.r, d22.i = q__1.i;
 		    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
 		    c_div(&q__1, &c_b1, &q__2);
 		    t.r = q__1.r, t.i = q__1.i;
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			i__2 = j + (k - 1) * a_dim1;
 			i__3 = j + (kw - 1) * w_dim1;
 			q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + kw * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			c_div(&q__2, &q__3, &d12);
 			q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
 				q__2.i + t.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 			i__2 = j + k * a_dim1;
 			i__3 = j + kw * w_dim1;
 			q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + (kw - 1) * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			c_div(&q__2, &q__3, &d12);
 			q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
 				q__2.i + t.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L20: */
 		    }
 		}
 		i__1 = k - 1 + (k - 1) * a_dim1;
 		i__2 = k - 1 + (kw - 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1 + k * a_dim1;
 		i__2 = k - 1 + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + k * a_dim1;
 		i__2 = k + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    ++j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	++j;
 	if (jp2 != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	jj = j - 1;
 	if (jp1 != jj && kstep == 2) {
 	    i__1 = *n - j + 1;
 	    cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	if (j <= *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	kstep = 1;
 	p = k;
 	i__1 = *n - k + 1;
 	ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
 	if (k > 1) {
 	    i__1 = *n - k + 1;
 	    i__2 = k - 1;
 	    q__1.r = -1.f, q__1.i = -0.f;
 	    cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
 		    w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
 		    ftnlen)12);
 	}
 	i__1 = k + k * w_dim1;
 	absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
 		w_dim1]), dabs(r__2));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    i__1 = imax + k * w_dim1;
 	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
 		    + k * w_dim1]), dabs(r__2));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = *n - k + 1;
 	    ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 		    c__1);
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L72:
 		i__1 = imax - k;
 		ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = *n - imax + 1;
 		ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
 			1) * w_dim1], &c__1);
 		if (k > 1) {
 		    i__1 = *n - k + 1;
 		    i__2 = k - 1;
 		    q__1.r = -1.f, q__1.i = -0.f;
 		    cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
 			    , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
 			    1) * w_dim1], &c__1, (ftnlen)12);
 		}
 		if (imax != k) {
 		    i__1 = imax - k;
 		    jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
 			    c__1);
 		    i__1 = jmax + (k + 1) * w_dim1;
 		    rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
 			    w[jmax + (k + 1) * w_dim1]), dabs(r__2));
 		} else {
 		    rowmax = 0.f;
 		}
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 		    i__1 = itemp + (k + 1) * w_dim1;
 		    stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
 			    w[itemp + (k + 1) * w_dim1]), dabs(r__2));
 		    if (stemp > rowmax) {
 			rowmax = stemp;
 			jmax = itemp;
 		    }
 		}
 		i__1 = imax + (k + 1) * w_dim1;
 		if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
 			imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha *
 			rowmax)) {
 		    kp = imax;
 		    i__1 = *n - k + 1;
 		    ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    i__1 = *n - k + 1;
 		    ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		}
 		if (! done) {
 		    goto L72;
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kstep == 2 && p != k) {
 		i__1 = p - k;
 		ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
 			lda);
 		i__1 = *n - p + 1;
 		ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
 			c__1);
 		cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
 		cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
 	    }
 	    if (kp != kk) {
 		i__1 = kp + k * a_dim1;
 		i__2 = kk + k * a_dim1;
 		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
 		i__1 = kp - k - 1;
 		ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
 			* a_dim1], lda);
 		i__1 = *n - kp + 1;
 		ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
 			a_dim1], &c__1);
 		cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    i__1 = k + k * a_dim1;
 		    if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
 			    + k * a_dim1]), dabs(r__2)) >= sfmin) {
 			c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
 			r1.r = q__1.r, r1.i = q__1.i;
 			i__1 = *n - k;
 			cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		    } else /* if(complicated condition) */ {
 			i__1 = k + k * a_dim1;
 			if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
 			    i__1 = *n;
 			    for (ii = k + 1; ii <= i__1; ++ii) {
 				i__2 = ii + k * a_dim1;
 				c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
 					a_dim1]);
 				a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L74: */
 			    }
 			}
 		    }
 		}
 	    } else {
 		if (k < *n - 1) {
 		    i__1 = k + 1 + k * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
 		    d11.r = q__1.r, d11.i = q__1.i;
 		    c_div(&q__1, &w[k + k * w_dim1], &d21);
 		    d22.r = q__1.r, d22.i = q__1.i;
 		    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
 		    c_div(&q__1, &c_b1, &q__2);
 		    t.r = q__1.r, t.i = q__1.i;
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			i__2 = j + k * a_dim1;
 			i__3 = j + k * w_dim1;
 			q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + (k + 1) * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			c_div(&q__2, &q__3, &d21);
 			q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
 				q__2.i + t.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 			i__2 = j + (k + 1) * a_dim1;
 			i__3 = j + (k + 1) * w_dim1;
 			q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + k * w_dim1;
 			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
 				.i;
 			c_div(&q__2, &q__3, &d21);
 			q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
 				q__2.i + t.i * q__2.r;
 			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
 /* L80: */
 		    }
 		}
 		i__1 = k + k * a_dim1;
 		i__2 = k + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + k * a_dim1;
 		i__2 = k + 1 + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + (k + 1) * a_dim1;
 		i__2 = k + 1 + (k + 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    --j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	--j;
 	if (jp2 != jj && j >= 1) {
 	    cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	jj = j + 1;
 	if (jp1 != jj && kstep == 2) {
 	    cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j >= 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/ctgsyl.c
+++ b/relapack/src/ctgsyl.c
@@ -0,0 +1,268 @@
 #include "relapack.h"
 #include <math.h>

 static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *,
    const int *, const float *, const int *, const float *, const int *,
    float *, const int *, const float *, const int *, const float *,
    const int *, float *, const int *, float *, float *, float *, int *);


 /** CTGSYL solves the generalized Sylvester equation.
 *
 * This routine is functionally equivalent to LAPACK's ctgsyl.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
 * */
 void RELAPACK_ctgsyl(
    const char *trans, const int *ijob, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dif,
    float *Work, const int *lWork, int *iWork, int *info
 ) {

    // Parse arguments
    const int notran = LAPACK(lsame)(trans, "N");
    const int tran = LAPACK(lsame)(trans, "C");

    // Compute work buffer size
    int lwmin = 1;
    if (notran && (*ijob == 1 || *ijob == 2))
        lwmin = MAX(1, 2 * *m * *n);
    *info = 0;

    // Check arguments
    if (!tran && !notran)
        *info = -1;
    else if (notran && (*ijob < 0 || *ijob > 4))
        *info = -2;
    else if (*m <= 0)
        *info = -3;
    else if (*n <= 0)
        *info = -4;
    else if (*ldA < MAX(1, *m))
        *info = -6;
    else if (*ldB < MAX(1, *n))
        *info = -8;
    else if (*ldC < MAX(1, *m))
        *info = -10;
    else if (*ldD < MAX(1, *m))
        *info = -12;
    else if (*ldE < MAX(1, *n))
        *info = -14;
    else if (*ldF < MAX(1, *m))
        *info = -16;
    else if (*lWork < lwmin && *lWork != -1)
        *info = -20;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CTGSYL", &minfo);
        return;
    }

    if (*lWork == -1) {
        // Work size query
        *Work = lwmin;
        return;
    }

    // Clean char * arguments
    const char cleantrans = notran ? 'N' : 'C';

    // Constant
    const float ZERO[] = { 0., 0. };

    int isolve = 1;
    int ifunc  = 0;
    if (notran) {
        if (*ijob >= 3) {
            ifunc = *ijob - 2;
            LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
            LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
        } else if (*ijob >= 1)
            isolve = 2;
    }

    float scale2;
    int iround;
    for (iround = 1; iround <= isolve; iround++) {
        *scale = 1;
        float dscale = 0;
        float dsum   = 1;
        RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
        if (dscale != 0) {
            if (*ijob == 1 || *ijob == 3)
                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
            else
                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
        }
        if (isolve == 2) {
            if (iround == 1) {
                if (notran)
                    ifunc = *ijob;
                scale2 = *scale;
                LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
                LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
                LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
                LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
            } else {
                LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
                LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
                *scale = scale2;
            }
        }
    }
 }


 /** ctgsyl's recursive vompute kernel */
 static void RELAPACK_ctgsyl_rec(
    const char *trans, const int *ifunc, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dsum, float *dscale,
    int *info
 ) {

    if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
        // Unblocked
        LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int   iONE[] = { 1 };

    // Outputs
    float scale1[] = { 1., 0. };
    float scale2[] = { 1., 0. };
    int   info1[]  = { 0 };
    int   info2[]  = { 0 };

    if (*m > *n) {
        // Splitting
        const int m1 = CREC_SPLIT(*m);
        const int m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
        const float *const A_TL = A;
        const float *const A_TR = A + 2 * *ldA * m1;
        const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;

        // C_T
        // C_B
        float *const C_T = C;
        float *const C_B = C + 2 * m1;

        // D_TL D_TR
        // 0    D_BR
        const float *const D_TL = D;
        const float *const D_TR = D + 2 * *ldD * m1;
        const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1;

        // F_T
        // F_B
        float *const F_T = F;
        float *const F_B = F + 2 * m1;

        if (*trans == 'N') {
            // recursion(A_BR, B, C_B, D_BR, E, F_B)
            RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
            // C_T = C_T - A_TR * C_B
            BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
            // F_T = F_T - D_TR * C_B
            BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
            // recursion(A_TL, B, C_T, D_TL, E, F_T)
            RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
            }
        } else {
            // recursion(A_TL, B, C_T, D_TL, E, F_T)
            RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
            // apply scale
            if (scale1[0] != 1)
                LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
            // C_B = C_B - A_TR^H * C_T
            BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
            // C_B = C_B - D_TR^H * F_T
            BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
            // recursion(A_BR, B, C_B, D_BR, E, F_B)
            RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
            }
        }
    } else {
        // Splitting
        const int n1 = CREC_SPLIT(*n);
        const int n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
        const float *const B_TL = B;
        const float *const B_TR = B + 2 * *ldB * n1;
        const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;

        // C_L C_R
        float *const C_L = C;
        float *const C_R = C + 2 * *ldC * n1;

        // E_TL E_TR
        // 0    E_BR
        const float *const E_TL = E;
        const float *const E_TR = E + 2 * *ldE * n1;
        const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1;

        // F_L F_R
        float *const F_L = F;
        float *const F_R = F + 2 * *ldF * n1;

        if (*trans == 'N') {
            // recursion(A, B_TL, C_L, D, E_TL, F_L)
            RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
            // C_R = C_R + F_L * B_TR
            BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
            // F_R = F_R + F_L * E_TR
            BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
            // recursion(A, B_BR, C_R, D, E_BR, F_R)
            RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
            }
        } else {
            // recursion(A, B_BR, C_R, D, E_BR, F_R)
            RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
            // apply scale
            if (scale1[0] != 1)
                LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
            // F_L = F_L + C_R * B_TR
            BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
            // F_L = F_L + F_R * E_TR
            BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
            // recursion(A, B_TL, C_L, D, E_TL, F_L)
            RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
            }
        }
    }

    *scale = scale1[0] * scale2[0];
    *info  = info1[0] || info2[0];
 }
--- a/relapack/src/ctrsyl.c
+++ b/relapack/src/ctrsyl.c
@@ -0,0 +1,163 @@
 #include "relapack.h"

 static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *,
    const int *, const int *, const float *, const int *, const float *,
    const int *, float *, const int *, float *, int *);


 /** CTRSYL solves the complex Sylvester matrix equation.
 *
 * This routine is functionally equivalent to LAPACK's ctrsyl.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html
 * */
 void RELAPACK_ctrsyl(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC, float *scale,
    int *info
 ) {

    // Check arguments
    const int notransA = LAPACK(lsame)(tranA, "N");
    const int ctransA = LAPACK(lsame)(tranA, "C");
    const int notransB = LAPACK(lsame)(tranB, "N");
    const int ctransB = LAPACK(lsame)(tranB, "C");
    *info = 0;
    if (!ctransA && !notransA)
        *info = -1;
    else if (!ctransB && !notransB)
        *info = -2;
    else if (*isgn != 1 && *isgn != -1)
        *info = -3;
    else if (*m < 0)
        *info = -4;
    else if (*n < 0)
        *info = -5;
    else if (*ldA < MAX(1, *m))
        *info = -7;
    else if (*ldB < MAX(1, *n))
        *info = -9;
    else if (*ldC < MAX(1, *m))
        *info = -11;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CTRSYL", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleantranA = notransA ? 'N' : 'C';
    const char cleantranB = notransB ? 'N' : 'C';

    // Recursive kernel
    RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }


 /** ctrsyl's recursive compute kernel */
 static void RELAPACK_ctrsyl_rec(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC, float *scale,
    int *info
 ) {

    if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) {
        // Unblocked
        RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const float MSGN[] = { -*isgn, 0. };
    const int   iONE[] = { 1 };

    // Outputs
    float scale1[] = { 1., 0. };
    float scale2[] = { 1., 0. };
    int   info1[]  = { 0 };
    int   info2[]  = { 0 };

    if (*m > *n) {
        // Splitting
        const int m1 = CREC_SPLIT(*m);
        const int m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
        const float *const A_TL = A;
        const float *const A_TR = A + 2 * *ldA * m1;
        const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;

        // C_T
        // C_B
        float *const C_T = C;
        float *const C_B = C + 2 * m1;

        if (*tranA == 'N') {
            // recusion(A_BR, B, C_B)
            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
            // C_T = C_T - A_TR * C_B
            BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
            // recusion(A_TL, B, C_T)
            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
        } else {
            // recusion(A_TL, B, C_T)
            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
            // C_B = C_B - A_TR' * C_T
            BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
            // recusion(A_BR, B, C_B)
            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
        }
    } else {
        // Splitting
        const int n1 = CREC_SPLIT(*n);
        const int n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
        const float *const B_TL = B;
        const float *const B_TR = B + 2 * *ldB * n1;
        const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;

        // C_L C_R
        float *const C_L = C;
        float *const C_R = C + 2 * *ldC * n1;

        if (*tranB == 'N') {
            // recusion(A, B_TL, C_L)
            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
            // C_R = C_R -/+ C_L * B_TR
            BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
            // recusion(A, B_BR, C_R)
            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
        } else {
            // recusion(A, B_BR, C_R)
            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
            // C_L = C_L -/+ C_R * B_TR'
            BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
            // recusion(A, B_TL, C_L)
            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
        }
    }

    *scale = scale1[0] * scale2[0];
    *info  = info1[0] || info2[0];
 }
--- a/relapack/src/ctrsyl_rec2.c
+++ b/relapack/src/ctrsyl_rec2.c
@@ -0,0 +1,392 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "../config.h"
 #include "f2c.h"

 #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
 complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
    extern void cdotu_(complex *, int *, complex *, int *, complex *, int *);
    complex result;
    cdotu_(&result, n, x, incx, y, incy);
    return result;
 }
 #define cdotu_ cdotu_fun

 complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
    extern void cdotc_(complex *, int *, complex *, int *, complex *, int *);
    complex result;
    cdotc_(&result, n, x, incx, y, incy);
    return result;
 }
 #define cdotc_ cdotc_fun
 #endif

 #if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
 complex cladiv_fun(complex *a, complex *b) {
    extern void cladiv_(complex *, complex *, complex *);
    complex result;
    cladiv_(&result, a, b);
    return result;
 }
 #define cladiv_ cladiv_fun
 #endif

 /* Table of constant values */

 static int c__1 = 1;

 /** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
 *
 * This routine is an exact copy of LAPACK's ctrsyl.
 * It serves as an unblocked kernel in the recursive algorithms.
 * */
 /* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int
 	*isgn, int *m, int *n, complex *a, int *lda, complex *b,
 	int *ldb, complex *c__, int *ldc, float *scale, int *info,
 	ftnlen trana_len, ftnlen tranb_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4;
    float r__1, r__2;
    complex q__1, q__2, q__3, q__4;

    /* Builtin functions */
    float r_imag(complex *);
    void r_cnjg(complex *, complex *);

    /* Local variables */
    static int j, k, l;
    static complex a11;
    static float db;
    static complex x11;
    static float da11;
    static complex vec;
    static float dum[1], eps, sgn, smin;
    static complex suml, sumr;
    /* Complex */ complex cdotc_(int *, complex *, int
 	    *, complex *, int *);
    extern int lsame_(char *, char *, ftnlen, ftnlen);
    /* Complex */ complex cdotu_(int *, complex *, int
 	    *, complex *, int *);
    extern /* Subroutine */ int slabad_(float *, float *);
    extern float clange_(char *, int *, int *, complex *,
 	    int *, float *, ftnlen);
    /* Complex */ complex cladiv_(complex *, complex *);
    static float scaloc;
    extern float slamch_(char *, ftnlen);
    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
 	    *), xerbla_(char *, int *, ftnlen);
    static float bignum;
    static int notrna, notrnb;
    static float smlnum;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1;
    b -= b_offset;
    c_dim1 = *ldc;
    c_offset = 1 + c_dim1;
    c__ -= c_offset;

    /* Function Body */
    notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
    notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
    *info = 0;
    if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
 	*info = -1;
    } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
 	*info = -2;
    } else if (*isgn != 1 && *isgn != -1) {
 	*info = -3;
    } else if (*m < 0) {
 	*info = -4;
    } else if (*n < 0) {
 	*info = -5;
    } else if (*lda < max(1,*m)) {
 	*info = -7;
    } else if (*ldb < max(1,*n)) {
 	*info = -9;
    } else if (*ldc < max(1,*m)) {
 	*info = -11;
    }
    if (*info != 0) {
 	i__1 = -(*info);
 	xerbla_("CTRSY2", &i__1, (ftnlen)6);
 	return;
    }
    *scale = 1.f;
    if (*m == 0 || *n == 0) {
 	return;
    }
    eps = slamch_("P", (ftnlen)1);
    smlnum = slamch_("S", (ftnlen)1);
    bignum = 1.f / smlnum;
    slabad_(&smlnum, &bignum);
    smlnum = smlnum * (float) (*m * *n) / eps;
    bignum = 1.f / smlnum;
 /* Computing MAX */
    r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, (
 	    ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n,
 	    &b[b_offset], ldb, dum, (ftnlen)1);
    smin = dmax(r__1,r__2);
    sgn = (float) (*isgn);
    if (notrna && notrnb) {
 	i__1 = *n;
 	for (l = 1; l <= i__1; ++l) {
 	    for (k = *m; k >= 1; --k) {
 		i__2 = *m - k;
 /* Computing MIN */
 		i__3 = k + 1;
 /* Computing MIN */
 		i__4 = k + 1;
 		q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
 			min(i__4,*m) + l * c_dim1], &c__1);
 		suml.r = q__1.r, suml.i = q__1.i;
 		i__2 = l - 1;
 		q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
 			, &c__1);
 		sumr.r = q__1.r, sumr.i = q__1.i;
 		i__2 = k + l * c_dim1;
 		q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
 		q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
 		q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
 		vec.r = q__1.r, vec.i = q__1.i;
 		scaloc = 1.f;
 		i__2 = k + k * a_dim1;
 		i__3 = l + l * b_dim1;
 		q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i;
 		q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i;
 		a11.r = q__1.r, a11.i = q__1.i;
 		da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
 			dabs(r__2));
 		if (da11 <= smin) {
 		    a11.r = smin, a11.i = 0.f;
 		    da11 = smin;
 		    *info = 1;
 		}
 		db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
 			r__2));
 		if (da11 < 1.f && db > 1.f) {
 		    if (db > bignum * da11) {
 			scaloc = 1.f / db;
 		    }
 		}
 		q__3.r = scaloc, q__3.i = 0.f;
 		q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
 			q__3.i + vec.i * q__3.r;
 		q__1 = cladiv_(&q__2, &a11);
 		x11.r = q__1.r, x11.i = q__1.i;
 		if (scaloc != 1.f) {
 		    i__2 = *n;
 		    for (j = 1; j <= i__2; ++j) {
 			csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
 /* L10: */
 		    }
 		    *scale *= scaloc;
 		}
 		i__2 = k + l * c_dim1;
 		c__[i__2].r = x11.r, c__[i__2].i = x11.i;
 /* L20: */
 	    }
 /* L30: */
 	}
    } else if (! notrna && notrnb) {
 	i__1 = *n;
 	for (l = 1; l <= i__1; ++l) {
 	    i__2 = *m;
 	    for (k = 1; k <= i__2; ++k) {
 		i__3 = k - 1;
 		q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
 			c_dim1 + 1], &c__1);
 		suml.r = q__1.r, suml.i = q__1.i;
 		i__3 = l - 1;
 		q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
 			, &c__1);
 		sumr.r = q__1.r, sumr.i = q__1.i;
 		i__3 = k + l * c_dim1;
 		q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
 		q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
 		q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i;
 		vec.r = q__1.r, vec.i = q__1.i;
 		scaloc = 1.f;
 		r_cnjg(&q__2, &a[k + k * a_dim1]);
 		i__3 = l + l * b_dim1;
 		q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
 		q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
 		a11.r = q__1.r, a11.i = q__1.i;
 		da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
 			dabs(r__2));
 		if (da11 <= smin) {
 		    a11.r = smin, a11.i = 0.f;
 		    da11 = smin;
 		    *info = 1;
 		}
 		db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
 			r__2));
 		if (da11 < 1.f && db > 1.f) {
 		    if (db > bignum * da11) {
 			scaloc = 1.f / db;
 		    }
 		}
 		q__3.r = scaloc, q__3.i = 0.f;
 		q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
 			q__3.i + vec.i * q__3.r;
 		q__1 = cladiv_(&q__2, &a11);
 		x11.r = q__1.r, x11.i = q__1.i;
 		if (scaloc != 1.f) {
 		    i__3 = *n;
 		    for (j = 1; j <= i__3; ++j) {
 			csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
 /* L40: */
 		    }
 		    *scale *= scaloc;
 		}
 		i__3 = k + l * c_dim1;
 		c__[i__3].r = x11.r, c__[i__3].i = x11.i;
 /* L50: */
 	    }
 /* L60: */
 	}
    } else if (! notrna && ! notrnb) {
 	for (l = *n; l >= 1; --l) {
 	    i__1 = *m;
 	    for (k = 1; k <= i__1; ++k) {
 		i__2 = k - 1;
 		q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
 			c_dim1 + 1], &c__1);
 		suml.r = q__1.r, suml.i = q__1.i;
 		i__2 = *n - l;
 /* Computing MIN */
 		i__3 = l + 1;
 /* Computing MIN */
 		i__4 = l + 1;
 		q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
 			l + min(i__4,*n) * b_dim1], ldb);
 		sumr.r = q__1.r, sumr.i = q__1.i;
 		i__2 = k + l * c_dim1;
 		r_cnjg(&q__4, &sumr);
 		q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
 		q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
 		q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
 		vec.r = q__1.r, vec.i = q__1.i;
 		scaloc = 1.f;
 		i__2 = k + k * a_dim1;
 		i__3 = l + l * b_dim1;
 		q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
 		q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i;
 		r_cnjg(&q__1, &q__2);
 		a11.r = q__1.r, a11.i = q__1.i;
 		da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
 			dabs(r__2));
 		if (da11 <= smin) {
 		    a11.r = smin, a11.i = 0.f;
 		    da11 = smin;
 		    *info = 1;
 		}
 		db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
 			r__2));
 		if (da11 < 1.f && db > 1.f) {
 		    if (db > bignum * da11) {
 			scaloc = 1.f / db;
 		    }
 		}
 		q__3.r = scaloc, q__3.i = 0.f;
 		q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
 			q__3.i + vec.i * q__3.r;
 		q__1 = cladiv_(&q__2, &a11);
 		x11.r = q__1.r, x11.i = q__1.i;
 		if (scaloc != 1.f) {
 		    i__2 = *n;
 		    for (j = 1; j <= i__2; ++j) {
 			csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
 /* L70: */
 		    }
 		    *scale *= scaloc;
 		}
 		i__2 = k + l * c_dim1;
 		c__[i__2].r = x11.r, c__[i__2].i = x11.i;
 /* L80: */
 	    }
 /* L90: */
 	}
    } else if (notrna && ! notrnb) {
 	for (l = *n; l >= 1; --l) {
 	    for (k = *m; k >= 1; --k) {
 		i__1 = *m - k;
 /* Computing MIN */
 		i__2 = k + 1;
 /* Computing MIN */
 		i__3 = k + 1;
 		q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
 			min(i__3,*m) + l * c_dim1], &c__1);
 		suml.r = q__1.r, suml.i = q__1.i;
 		i__1 = *n - l;
 /* Computing MIN */
 		i__2 = l + 1;
 /* Computing MIN */
 		i__3 = l + 1;
 		q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
 			l + min(i__3,*n) * b_dim1], ldb);
 		sumr.r = q__1.r, sumr.i = q__1.i;
 		i__1 = k + l * c_dim1;
 		r_cnjg(&q__4, &sumr);
 		q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
 		q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
 		q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i;
 		vec.r = q__1.r, vec.i = q__1.i;
 		scaloc = 1.f;
 		i__1 = k + k * a_dim1;
 		r_cnjg(&q__3, &b[l + l * b_dim1]);
 		q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i;
 		q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i;
 		a11.r = q__1.r, a11.i = q__1.i;
 		da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
 			dabs(r__2));
 		if (da11 <= smin) {
 		    a11.r = smin, a11.i = 0.f;
 		    da11 = smin;
 		    *info = 1;
 		}
 		db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
 			r__2));
 		if (da11 < 1.f && db > 1.f) {
 		    if (db > bignum * da11) {
 			scaloc = 1.f / db;
 		    }
 		}
 		q__3.r = scaloc, q__3.i = 0.f;
 		q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
 			q__3.i + vec.i * q__3.r;
 		q__1 = cladiv_(&q__2, &a11);
 		x11.r = q__1.r, x11.i = q__1.i;
 		if (scaloc != 1.f) {
 		    i__1 = *n;
 		    for (j = 1; j <= i__1; ++j) {
 			csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
 /* L100: */
 		    }
 		    *scale *= scaloc;
 		}
 		i__1 = k + l * c_dim1;
 		c__[i__1].r = x11.r, c__[i__1].i = x11.i;
 /* L110: */
 	    }
 /* L120: */
 	}
    }
    return;
 }
--- a/relapack/src/ctrtri.c
+++ b/relapack/src/ctrtri.c
@@ -0,0 +1,107 @@
 #include "relapack.h"

 static void RELAPACK_ctrtri_rec(const char *, const char *, const int *,
    float *, const int *, int *);


 /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A.
 *
 * This routine is functionally equivalent to LAPACK's ctrtri.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html
 * */
 void RELAPACK_ctrtri(
    const char *uplo, const char *diag, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    const int nounit = LAPACK(lsame)(diag, "N");
    const int unit = LAPACK(lsame)(diag, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (!nounit && !unit)
        *info = -2;
    else if (*n < 0)
        *info = -3;
    else if (*ldA < MAX(1, *n))
        *info = -5;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CTRTRI", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower  ? 'L' : 'U';
    const char cleandiag = nounit ? 'N' : 'U';

    // check for singularity
    if (nounit) {
        int i;
        for (i = 0; i < *n; i++)
            if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
                *info = i;
                return;
            }
    }

    // Recursive kernel
    RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
 }


 /** ctrtri's recursive compute kernel */
 static void RELAPACK_ctrtri_rec(
    const char *uplo, const char *diag, const int *n,
    float *A, const int *ldA,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_CTRTRI, 1)) {
        // Unblocked
        LAPACK(ctrti2)(uplo, diag, n, A, ldA, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1., 0. };
    const float MONE[] = { -1., 0. };

    // Splitting
    const int n1 = CREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A                 + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL)
    RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
    if (*info)
        return;

    if (*uplo == 'L') {
        // A_BL = - A_BL * A_TL
        BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
        // A_BL = A_BR \ A_BL
        BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
    } else {
        // A_TR = - A_TL * A_TR
        BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
        // A_TR = A_TR / A_BR
        BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/dgbtrf.c
+++ b/relapack/src/dgbtrf.c
@@ -0,0 +1,227 @@
 #include "relapack.h"
 #include "stdlib.h"

 static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *,
    const int *, double *, const int *, int *, double *, const int *, double *,
    const int *, int *);


 /** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's dgbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html
 * */
 void RELAPACK_dgbtrf(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {

    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kl < 0)
        *info = -3;
    else if (*ku < 0)
        *info = -4;
    else if (*ldAb < 2 * *kl + *ku + 1)
        *info = -6;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DGBTRF", &minfo);
        return;
    }

    // Constant
    const double ZERO[] = { 0. };

    // Result upper band width
    const int kv = *ku + *kl;

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + kv;

    // Zero upper diagonal fill-in elements
    int i, j;
    for (j = 0; j < *n; j++) {
        double *const A_j = A + *ldA * j;
        for (i = MAX(0, j - kv); i < j - *ku; i++)
            A_j[i] = 0.;
    }

    // Allocate work space
    const int n1 = DREC_SPLIT(*n);
    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
    const int nWorkl = (kv > n1) ? n1 : kv;
    const int mWorku = (*kl > n1) ? n1 : *kl;
    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
    double *Workl = malloc(mWorkl * nWorkl * sizeof(double));
    double *Worku = malloc(mWorku * nWorku * sizeof(double));
    LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);

    // Recursive kernel
    RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);

    // Free work space
    free(Workl);
    free(Worku);
 }


 /** dgbtrf's recursive compute kernel */
 static void RELAPACK_dgbtrf_rec(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_DGBTRF, 1)) {
        // Unblocked
        LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };
    const int    iONE[] = { 1 };

    // Loop iterators
    int i, j;

    // Output upper band width
    const int kv = *ku + *kl;

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + kv;

    // Splitting
    const int n1  = MIN(DREC_SPLIT(*n), *kl);
    const int n2  = *n - n1;
    const int m1  = MIN(n1, *m);
    const int m2  = *m - m1;
    const int mn1 = MIN(m1, n1);
    const int mn2 = MIN(m2, n2);

    // Ab_L *
    //      Ab_BR
    double *const Ab_L  = Ab;
    double *const Ab_BR = Ab + *ldAb * n1;

    // A_L A_R
    double *const A_L = A;
    double *const A_R = A + *ldA * n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A             + m1;
    double *const A_BR = A + *ldA * n1 + m1;

    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;

    // Banded splitting
    const int n21 = MIN(n2, kv - n1);
    const int n22 = MIN(n2 - n21, n1);
    const int m21 = MIN(m2, *kl - m1);
    const int m22 = MIN(m2 - m21, m1);

    //   n1 n21  n22
    // m *  A_Rl ARr
    double *const A_Rl = A_R;
    double *const A_Rr = A_R + *ldA * n21;

    //     n1    n21    n22
    // m1  *     A_TRl  A_TRr
    // m21 A_BLt A_BRtl A_BRtr
    // m22 A_BLb A_BRbl A_BRbr
    double *const A_TRl  = A_TR;
    double *const A_TRr  = A_TR + *ldA * n21;
    double *const A_BLt  = A_BL;
    double *const A_BLb  = A_BL              + m21;
    double *const A_BRtl = A_BR;
    double *const A_BRtr = A_BR + *ldA * n21;
    double *const A_BRbl = A_BR              + m21;
    double *const A_BRbr = A_BR + *ldA * n21 + m21;

    // recursion(Ab_L, ipiv_T)
    RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);

    // Workl = A_BLb
    LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);

    // partially redo swaps in A_L
    for (i = 0; i < mn1; i++) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
            else
                BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
        }
    }

    // apply pivots to A_Rl
    LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);

    // apply pivots to A_Rr columnwise
    for (j = 0; j < n22; j++) {
        double *const A_Rrj = A_Rr + *ldA * j;
        for (i = j; i < mn1; i++) {
            const int ip = ipiv_T[i] - 1;
            if (ip != i) {
                const double tmp = A_Rrj[i];
                A_Rrj[i] = A_Rr[ip];
                A_Rrj[ip] = tmp;
            }
        }
    }

    // A_TRl = A_TL \ A_TRl
    BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
    // Worku = A_TRr
    LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
    // Worku = A_TL \ Worku
    BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
    // A_TRr = Worku
    LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
    // A_BRtl = A_BRtl - A_BLt * A_TRl
    BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
    // A_BRbl = A_BRbl - Workl * A_TRl
    BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
    // A_BRtr = A_BRtr - A_BLt * Worku
    BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
    // A_BRbr = A_BRbr - Workl * Worku
    BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);

    // partially undo swaps in A_L
    for (i = mn1 - 1; i >= 0; i--) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
            else
                BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
        }
    }

    // recursion(Ab_BR, ipiv_B)
    RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
    if (*info)
        *info += n1;
    // shift pivots
    for (i = 0; i < mn2; i++)
        ipiv_B[i] += n1;
 }
--- a/relapack/src/dgemmt.c
+++ b/relapack/src/dgemmt.c
@@ -0,0 +1,165 @@
 #include "relapack.h"

 static void RELAPACK_dgemmt_rec(const char *, const char *, const char *,
    const int *, const int *, const double *, const double *, const int *,
    const double *, const int *, const double *, double *, const int *);

 static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *,
    const int *, const int *, const double *, const double *, const int *,
    const double *, const int *, const double *, double *, const int *);


 /** DGEMMT computes a matrix-matrix product with general matrices but updates
 * only the upper or lower triangular part of the result matrix.
 *
 * This routine performs the same operation as the BLAS routine
 * dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
 * but only updates the triangular part of C specified by uplo:
 * If (*uplo == 'L'), only the lower triangular part of C is updated,
 * otherwise the upper triangular part is updated.
 * */
 void RELAPACK_dgemmt(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {

 #if HAVE_XGEMMT
    BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
    return;
 #else

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    const int notransA = LAPACK(lsame)(transA, "N");
    const int tranA = LAPACK(lsame)(transA, "T");
    const int notransB = LAPACK(lsame)(transB, "N");
    const int tranB = LAPACK(lsame)(transB, "T");
    int info = 0;
    if (!lower && !upper)
        info = 1;
    else if (!tranA && !notransA)
        info = 2;
    else if (!tranB && !notransB)
        info = 3;
    else if (*n < 0)
        info = 4;
    else if (*k < 0)
        info = 5;
    else if (*ldA < MAX(1, notransA ? *n : *k))
        info = 8;
    else if (*ldB < MAX(1, notransB ? *k : *n))
        info = 10;
    else if (*ldC < MAX(1, *n))
        info = 13;
    if (info) {
        LAPACK(xerbla)("DGEMMT", &info);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';
    const char cleantransA = notransA ? 'N' : 'T';
    const char cleantransB = notransB ? 'N' : 'T';

    // Recursive kernel
    RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
 #endif
 }


 /** dgemmt's recursive compute kernel */
 static void RELAPACK_dgemmt_rec(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {

    if (*n <= MAX(CROSSOVER_DGEMMT, 1)) {
        // Unblocked
        RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
        return;
    }

    // Splitting
    const int n1 = DREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_T
    // A_B
    const double *const A_T = A;
    const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1);

    // B_L B_R
    const double *const B_L = B;
    const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1);

    // C_TL C_TR
    // C_BL C_BR
    double *const C_TL = C;
    double *const C_TR = C + *ldC * n1;
    double *const C_BL = C             + n1;
    double *const C_BR = C + *ldC * n1 + n1;

    // recursion(C_TL)
    RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);

    if (*uplo == 'L')
        // C_BL = alpha A_B B_L + beta C_BL
        BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
    else
        // C_TR = alpha A_T B_R + beta C_TR
        BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);

    // recursion(C_BR)
    RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
 }


 /** dgemmt's unblocked compute kernel */
 static void RELAPACK_dgemmt_rec2(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {

    const int incB = (*transB == 'N') ? 1 : *ldB;
    const int incC = 1;

    int i;
    for (i = 0; i < *n; i++) {
        // A_0
        // A_i
        const double *const A_0 = A;
        const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i);

        // * B_i *
        const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i);

        // * C_0i *
        // * C_ii *
        double *const C_0i = C + *ldC * i;
        double *const C_ii = C + *ldC * i + i;

        if (*uplo == 'L') {
            const int nmi = *n - i;
            if (*transA == 'N')
                BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
            else
                BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
        } else {
            const int ip1 = i + 1;
            if (*transA == 'N')
                BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
            else
                BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
        }
    }
 }
--- a/relapack/src/dgetrf.c
+++ b/relapack/src/dgetrf.c
@@ -0,0 +1,117 @@
 #include "relapack.h"

 static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
    const int *, int *, int *);


 /** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's dgetrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
 * */
 void RELAPACK_dgetrf(
    const int *m, const int *n,
    double *A, const int *ldA, int *ipiv,
    int *info
 ) {

    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DGETRF", &minfo);
        return;
    }

    const int sn = MIN(*m, *n);

    RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);

    // Right remainder
    if (*m < *n) {
        // Constants
        const double ONE[] = { 1. };
        const int   iONE[] = { 1. };

        // Splitting
        const int rn = *n - *m;

        // A_L A_R
        const double *const A_L = A;
        double *const       A_R = A + *ldA * *m;

        // A_R = apply(ipiv, A_R)
        LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
        // A_R = A_S \ A_R
        BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
    }
 }


 /** dgetrf's recursive compute kernel */
 static void RELAPACK_dgetrf_rec(
    const int *m, const int *n,
    double *A, const int *ldA, int *ipiv,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
        // Unblocked
        LAPACK(dgetf2)(m, n, A, ldA, ipiv, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };
    const int    iONE[] = { 1 };

    // Splitting
    const int n1 = DREC_SPLIT(*n);
    const int n2 = *n - n1;
    const int m2 = *m - n1;

    // A_L A_R
    double *const A_L = A;
    double *const A_R = A + *ldA * n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A             + n1;
    double *const A_BR = A + *ldA * n1 + n1;

    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;

    // recursion(A_L, ipiv_T)
    RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
    // apply pivots to A_R
    LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);

    // A_TR = A_TL \ A_TR
    BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
    // A_BR = A_BR - A_BL * A_TR
    BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);

    // recursion(A_BR, ipiv_B)
    RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
    if (*info)
        *info += n1;
    // apply pivots to A_BL
    LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
    // shift pivots
    int i;
    for (i = 0; i < n2; i++)
        ipiv_B[i] += n1;
 }
--- a/relapack/src/dlauum.c
+++ b/relapack/src/dlauum.c
@@ -0,0 +1,87 @@
 #include "relapack.h"

 static void RELAPACK_dlauum_rec(const char *, const int *, double *,
    const int *, int *);


 /** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
 *
 * This routine is functionally equivalent to LAPACK's dlauum.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html
 * */
 void RELAPACK_dlauum(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DLAUUM", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Recursive kernel
    RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info);
 }


 /** dlauum's recursive compute kernel */
 static void RELAPACK_dlauum_rec(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_DLAUUM, 1)) {
        // Unblocked
        LAPACK(dlauu2)(uplo, n, A, ldA, info);
        return;
    }

    // Constants
    const double ONE[] = { 1. };

    // Splitting
    const int n1 = DREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A             + n1;
    double *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL)
    RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info);

    if (*uplo == 'L') {
        // A_TL = A_TL + A_BL' * A_BL
        BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
        // A_BL = A_BR' * A_BL
        BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
    } else {
        // A_TL = A_TL + A_TR * A_TR'
        BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
        // A_TR = A_TR * A_BR'
        BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info);
 }
--- a/relapack/src/dpbtrf.c
+++ b/relapack/src/dpbtrf.c
@@ -0,0 +1,157 @@
 #include "relapack.h"
 #include "stdlib.h"

 static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *,
    double *, const int *, double *, const int *, int *);


 /** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
 *
 * This routine is functionally equivalent to LAPACK's dpbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html
 * */
 void RELAPACK_dpbtrf(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kd < 0)
        *info = -3;
    else if (*ldAb < *kd + 1)
        *info = -5;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DPBTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Constant
    const double ZERO[] = { 0. };

    // Allocate work space
    const int n1 = DREC_SPLIT(*n);
    const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
    const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
    double *Work = malloc(mWork * nWork * sizeof(double));
    LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);

    // Recursive kernel
    RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);

    // Free work space
    free(Work);
 }


 /** dpbtrf's recursive compute kernel */
 static void RELAPACK_dpbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    double *Work, const int *ldWork,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
        // Unblocked
        LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);

    // Splitting
    const int n1 = MIN(DREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;

    // * *
    // * Ab_BR
    double *const Ab_BR = Ab + *ldAb * n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A             + n1;
    double *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL)
    RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    // Banded splitting
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, n1);

    //     n1    n21    n22
    // n1  *     A_TRl  A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    double *const A_TRl  = A_TR;
    double *const A_TRr  = A_TR + *ldA * n21;
    double *const A_BLt  = A_BL;
    double *const A_BLb  = A_BL               + n21;
    double *const A_BRtl = A_BR;
    double *const A_BRtr = A_BR + *ldA * n21;
    double *const A_BRbl = A_BR               + n21;
    double *const A_BRbr = A_BR + *ldA * n21  + n21;

    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb
        LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work
        LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr
        LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work
        LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }

    // recursion(A_BR)
    if (*kd > n1)
        RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/dpotrf.c
+++ b/relapack/src/dpotrf.c
@@ -0,0 +1,92 @@
 #include "relapack.h"

 static void RELAPACK_dpotrf_rec(const char *, const int *, double *,
        const int *, int *);


 /** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
 *
 * This routine is functionally equivalent to LAPACK's dpotrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
 * */
 void RELAPACK_dpotrf(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DPOTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Recursive kernel
    RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info);
 }


 /** dpotrf's recursive compute kernel */
 static void RELAPACK_dpotrf_rec(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_DPOTRF, 1)) {
        // Unblocked
        LAPACK(dpotf2)(uplo, n, A, ldA, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };

    // Splitting
    const int n1 = DREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A             + n1;
    double *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL)
    RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    if (*uplo == 'L') {
        // A_BL = A_BL / A_TL'
        BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
        // A_BR = A_BR - A_BL * A_BL'
        BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
    } else {
        // A_TR = A_TL' \ A_TR
        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
        // A_BR = A_BR - A_TR' * A_TR
        BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/dsygst.c
+++ b/relapack/src/dsygst.c
@@ -0,0 +1,212 @@
 #include "relapack.h"
 #if XSYGST_ALLOW_MALLOC
 #include "stdlib.h"
 #endif

 static void RELAPACK_dsygst_rec(const int *, const char *, const int *,
    double *, const int *, const double *, const int *,
    double *, const int *, int *);


 /** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
 *
 * This routine is functionally equivalent to LAPACK's dsygst.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html
 * */
 void RELAPACK_dsygst(
    const int *itype, const char *uplo, const int *n,
    double *A, const int *ldA, const double *B, const int *ldB,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (*itype < 1 || *itype > 3)
        *info = -1;
    else if (!lower && !upper)
        *info = -2;
    else if (*n < 0)
        *info = -3;
    else if (*ldA < MAX(1, *n))
        *info = -5;
    else if (*ldB < MAX(1, *n))
        *info = -7;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DSYGST", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Allocate work space
    double *Work = NULL;
    int    lWork = 0;
 #if XSYGST_ALLOW_MALLOC
    const int n1 = DREC_SPLIT(*n);
    lWork = n1 * (*n - n1);
    Work  = malloc(lWork * sizeof(double));
    if (!Work)
        lWork = 0;
 #endif

    // recursive kernel
    RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);

    // Free work space
 #if XSYGST_ALLOW_MALLOC
    if (Work)
        free(Work);
 #endif
 }


 /** dsygst's recursive compute kernel */
 static void RELAPACK_dsygst_rec(
    const int *itype, const char *uplo, const int *n,
    double *A, const int *ldA, const double *B, const int *ldB,
    double *Work, const int *lWork, int *info
 ) {

    if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
        // Unblocked
        LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info);
        return;
    }

    // Constants
    const double ZERO[]  = { 0. };
    const double ONE[]   = { 1. };
    const double MONE[]  = { -1. };
    const double HALF[]  = { .5 };
    const double MHALF[] = { -.5 };
    const int    iONE[]  = { 1 };

    // Loop iterator
    int i;

    // Splitting
    const int n1 = DREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A             + n1;
    double *const A_BR = A + *ldA * n1 + n1;

    // B_TL B_TR
    // B_BL B_BR
    const double *const B_TL = B;
    const double *const B_TR = B + *ldB * n1;
    const double *const B_BL = B             + n1;
    const double *const B_BR = B + *ldB * n1 + n1;

    // recursion(A_TL, B_TL)
    RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);

    if (*itype == 1)
        if (*uplo == 'L') {
            // A_BL = A_BL / B_TL'
            BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * B_BL * A_TL
                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
            } else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
            BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
            else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR \ A_BL
            BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // A_TR = B_TL' \ A_TR
            BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * A_TL * B_TR
                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_BL + T
                for (i = 0; i < n2; i++)
                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
            } else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
            BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_BL + T
                for (i = 0; i < n2; i++)
                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
            else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR / B_BR
            BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }
    else
        if (*uplo == 'L') {
            // A_BL = A_BL * B_TL
            BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * A_BR * B_BL
                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
            } else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
            BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
            else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR * A_BL
            BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // A_TR = B_TL * A_TR
            BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * B_TR * A_BR
                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
            } else
                // A_TR = A_TR + 1/2 B_TR A_BR
                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
            BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
            else
                // A_TR = A_TR + 1/2 B_TR * A_BR
                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR * B_BR
            BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }

    // recursion(A_BR, B_BR)
    RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
 }
--- a/relapack/src/dsytrf.c
+++ b/relapack/src/dsytrf.c
@@ -0,0 +1,238 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
    double *, const int *, int *, double *, const int *, int *);


 /** DSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's dsytrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html
 * */
 void RELAPACK_dsytrf(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    double *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * sizeof(double));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DSYTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy arguments
    int nout;

    // Recursive kernel
    RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** dsytrf's recursive compute kernel */
 static void RELAPACK_dsytrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_DSYTRF, 3)) {
        // Unblocked
        if (top) {
            LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };
    const int    iONE[] = { 1 };

    // Loop iterator
    int i;

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = DREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        double *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        double *const A_BL   = A             + n1;
        double *const A_BR   = A + *ldA * n1 + n1;
        double *const A_BL_B = A             + *n;
        double *const A_BR_B = A + *ldA * n1 + *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        double *const Work_BL =              Work                + n1;
        double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;

            // last row of A_BL
            double *const A_BL_b = A_BL + n2_out;

            // last row of Work_BL
            double *const Work_BL_b = Work_BL + n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info  = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = DREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        double *const Work_R = top ? Work : Work + *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1  = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        double *const A_TL_T = A + *ldA * n_rest;
        double *const A_TR_T = A + *ldA * (n_rest + n1);
        double *const A_TL   = A + *ldA * n_rest        + n_rest;
        double *const A_TR   = A + *ldA * (n_rest + n1) + n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        double *const Work_L  = Work;
        double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/dsytrf_rec2.c
+++ b/relapack/src/dsytrf_rec2.c
@@ -0,0 +1,352 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static int c__1 = 1;
 static double c_b8 = -1.;
 static double c_b9 = 1.;

 /** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method.
 *
 * This routine is a minor modification of LAPACK's dlasyf.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int *
 	nb, int *kb, double *a, int *lda, int *ipiv,
 	double *w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
    double d__1, d__2, d__3;

    /* Builtin functions */
    double sqrt(double);

    /* Local variables */
    static int j, k;
    static double t, r1, d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static double alpha;
    extern /* Subroutine */ int dscal_(int *, double *, double *,
 	    int *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int dgemv_(char *, int *, int *,
 	    double *, double *, int *, double *, int *,
 	    double *, double *, int *, ftnlen), dcopy_(int *,
 	    double *, int *, double *, int *), dswap_(int
 	    *, double *, int *, double *, int *);
    static int kstep;
    static double absakk;
    extern int idamax_(int *, double *, int *);
    static double colmax, rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.) + 1.) / 8.;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	if (k < *n) {
 	    i__1 = *n - k;
 	    dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	}
 	kstep = 1;
 	absakk = (d__1 = w[k + kw * w_dim1], abs(d__1));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = k - imax;
 		dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		}
 		i__1 = k - imax;
 		jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
 			 &c__1);
 		rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1));
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 /* Computing MAX */
 		    d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1],
 			     abs(d__1));
 		    rowmax = max(d__2,d__3);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >=
 			alpha * rowmax) {
 		    kp = imax;
 		    dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		} else {
 		    kp = imax;
 		    kstep = 2;
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kp != kk) {
 		a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
 		i__1 = kk - 1 - kp;
 		dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		if (kp > 1) {
 		    i__1 = kp - 1;
 		    dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
 			    + 1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
 			    + 1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		r1 = 1. / a[k + k * a_dim1];
 		i__1 = k - 1;
 		dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 	    } else {
 		if (k > 2) {
 		    d21 = w[k - 1 + kw * w_dim1];
 		    d11 = w[k + kw * w_dim1] / d21;
 		    d22 = w[k - 1 + (kw - 1) * w_dim1] / d21;
 		    t = 1. / (d11 * d22 - 1.);
 		    d21 = t / d21;
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1)
 				* w_dim1] - w[j + kw * w_dim1]);
 			a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] -
 				w[j + (kw - 1) * w_dim1]);
 /* L20: */
 		    }
 		}
 		a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
 		a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
 		a[k + k * a_dim1] = w[k + kw * w_dim1];
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    ++j;
 	}
 	++j;
 	if (jp != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
 	}
 	if (j < *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	i__1 = *n - k + 1;
 	dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
 	i__1 = *n - k + 1;
 	i__2 = k - 1;
 	dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k
 		+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12);
 	kstep = 1;
 	absakk = (d__1 = w[k + k * w_dim1], abs(d__1));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    colmax = (d__1 = w[imax + k * w_dim1], abs(d__1));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		i__1 = imax - k;
 		dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = *n - imax + 1;
 		dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
 			1) * w_dim1], &c__1);
 		i__1 = *n - k + 1;
 		i__2 = k - 1;
 		dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1],
 			lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) *
 			w_dim1], &c__1, (ftnlen)12);
 		i__1 = imax - k;
 		jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
 			;
 		rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1));
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 /* Computing MAX */
 		    d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1],
 			    abs(d__1));
 		    rowmax = max(d__2,d__3);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >=
 			alpha * rowmax) {
 		    kp = imax;
 		    i__1 = *n - k + 1;
 		    dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		} else {
 		    kp = imax;
 		    kstep = 2;
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kp != kk) {
 		a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
 		i__1 = kp - kk - 1;
 		dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
 			1) * a_dim1], lda);
 		if (kp < *n) {
 		    i__1 = *n - kp;
 		    dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
 			    + kp * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		}
 		dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    r1 = 1. / a[k + k * a_dim1];
 		    i__1 = *n - k;
 		    dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		}
 	    } else {
 		if (k < *n - 1) {
 		    d21 = w[k + 1 + k * w_dim1];
 		    d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
 		    d22 = w[k + k * w_dim1] / d21;
 		    t = 1. / (d11 * d22 - 1.);
 		    d21 = t / d21;
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] -
 				w[j + (k + 1) * w_dim1]);
 			a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) *
 				 w_dim1] - w[j + k * w_dim1]);
 /* L80: */
 		    }
 		}
 		a[k + k * a_dim1] = w[k + k * w_dim1];
 		a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
 		a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    --j;
 	}
 	--j;
 	if (jp != jj && j >= 1) {
 	    dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j > 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/dsytrf_rook.c
+++ b/relapack/src/dsytrf_rook.c
@@ -0,0 +1,236 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *,
    double *, const int *, int *, double *, const int *, int *);


 /** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's dsytrf_rook.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html
 * */
 void RELAPACK_dsytrf_rook(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    double *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * sizeof(double));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DSYTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
    int nout;

    // Recursive kernel
    RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** dsytrf_rook's recursive compute kernel */
 static void RELAPACK_dsytrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) {
        // Unblocked
        if (top) {
            LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };
    const int    iONE[] = { 1 };

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = DREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        double *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2   = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        double *const A_BL   = A             + n1;
        double *const A_BR   = A + *ldA * n1 + n1;
        double *const A_BL_B = A             + *n;
        double *const A_BR_B = A + *ldA * n1 + *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        double *const Work_BL =              Work                + n1;
        double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;

            // last row of A_BL
            double *const A_BL_b = A_BL + n2_out;

            // last row of Work_BL
            double *const Work_BL_b = Work_BL + n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info  = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = DREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        double *const Work_R = top ? Work : Work + *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        double *const A_TL_T = A + *ldA * n_rest;
        double *const A_TR_T = A + *ldA * (n_rest + n1);
        double *const A_TL   = A + *ldA * n_rest        + n_rest;
        double *const A_TR   = A + *ldA * (n_rest + n1) + n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        double *const Work_L  = Work;
        double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/dsytrf_rook_rec2.c
+++ b/relapack/src/dsytrf_rook_rec2.c
@@ -0,0 +1,451 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static int c__1 = 1;
 static double c_b9 = -1.;
 static double c_b10 = 1.;

 /** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method.
 *
 * This routine is a minor modification of LAPACK's dlasyf.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n,
 	int *nb, int *kb, double *a, int *lda, int *ipiv,
 	double *w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
    double d__1;

    /* Builtin functions */
    double sqrt(double);

    /* Local variables */
    static int j, k, p;
    static double t, r1, d11, d12, d21, d22;
    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
    static logical done;
    static int imax, jmax;
    static double alpha;
    extern /* Subroutine */ int dscal_(int *, double *, double *,
 	    int *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int dgemv_(char *, int *, int *,
 	    double *, double *, int *, double *, int *,
 	    double *, double *, int *, ftnlen);
    static double dtemp, sfmin;
    static int itemp;
    extern /* Subroutine */ int dcopy_(int *, double *, int *,
 	    double *, int *), dswap_(int *, double *, int
 	    *, double *, int *);
    static int kstep;
    extern double dlamch_(char *, ftnlen);
    static double absakk;
    extern int idamax_(int *, double *, int *);
    static double colmax, rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.) + 1.) / 8.;
    sfmin = dlamch_("S", (ftnlen)1);
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	kstep = 1;
 	p = k;
 	dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	if (k < *n) {
 	    i__1 = *n - k;
 	    dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	}
 	absakk = (d__1 = w[k + kw * w_dim1], abs(d__1));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L12:
 		dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = k - imax;
 		dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		}
 		if (imax != k) {
 		    i__1 = k - imax;
 		    jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) *
 			    w_dim1], &c__1);
 		    rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1));
 		} else {
 		    rowmax = 0.;
 		}
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 		    dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1));
 		    if (dtemp > rowmax) {
 			rowmax = dtemp;
 			jmax = itemp;
 		    }
 		}
 		if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) <
 			alpha * rowmax)) {
 		    kp = imax;
 		    dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		}
 		if (! done) {
 		    goto L12;
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kstep == 2 && p != k) {
 		i__1 = k - p;
 		dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
 			a_dim1], lda);
 		dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
 			c__1);
 		i__1 = *n - k + 1;
 		dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
 			lda);
 		i__1 = *n - kk + 1;
 		dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
 			 ldw);
 	    }
 	    if (kp != kk) {
 		a[kp + k * a_dim1] = a[kk + k * a_dim1];
 		i__1 = k - 1 - kp;
 		dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
 			c__1);
 		i__1 = *n - kk + 1;
 		dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
 			 lda);
 		i__1 = *n - kk + 1;
 		dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		if (k > 1) {
 		    if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
 			r1 = 1. / a[k + k * a_dim1];
 			i__1 = k - 1;
 			dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 		    } else if (a[k + k * a_dim1] != 0.) {
 			i__1 = k - 1;
 			for (ii = 1; ii <= i__1; ++ii) {
 			    a[ii + k * a_dim1] /= a[k + k * a_dim1];
 /* L14: */
 			}
 		    }
 		}
 	    } else {
 		if (k > 2) {
 		    d12 = w[k - 1 + kw * w_dim1];
 		    d11 = w[k + kw * w_dim1] / d12;
 		    d22 = w[k - 1 + (kw - 1) * w_dim1] / d12;
 		    t = 1. / (d11 * d22 - 1.);
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) *
 				 w_dim1] - w[j + kw * w_dim1]) / d12);
 			a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] -
 				w[j + (kw - 1) * w_dim1]) / d12);
 /* L20: */
 		    }
 		}
 		a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
 		a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
 		a[k + k * a_dim1] = w[k + kw * w_dim1];
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    ++j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	++j;
 	if (jp2 != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	jj = j - 1;
 	if (jp1 != jj && kstep == 2) {
 	    i__1 = *n - j + 1;
 	    dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	if (j <= *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	kstep = 1;
 	p = k;
 	i__1 = *n - k + 1;
 	dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
 	if (k > 1) {
 	    i__1 = *n - k + 1;
 	    i__2 = k - 1;
 	    dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, &
 		    w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, (
 		    ftnlen)12);
 	}
 	absakk = (d__1 = w[k + k * w_dim1], abs(d__1));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    colmax = (d__1 = w[imax + k * w_dim1], abs(d__1));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = *n - k + 1;
 	    dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 		    c__1);
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L72:
 		i__1 = imax - k;
 		dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = *n - imax + 1;
 		dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
 			1) * w_dim1], &c__1);
 		if (k > 1) {
 		    i__1 = *n - k + 1;
 		    i__2 = k - 1;
 		    dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1]
 			    , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k
 			    + 1) * w_dim1], &c__1, (ftnlen)12);
 		}
 		if (imax != k) {
 		    i__1 = imax - k;
 		    jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &
 			    c__1);
 		    rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1));
 		} else {
 		    rowmax = 0.;
 		}
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 		    dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1));
 		    if (dtemp > rowmax) {
 			rowmax = dtemp;
 			jmax = itemp;
 		    }
 		}
 		if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha
 			* rowmax)) {
 		    kp = imax;
 		    i__1 = *n - k + 1;
 		    dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    i__1 = *n - k + 1;
 		    dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		}
 		if (! done) {
 		    goto L72;
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kstep == 2 && p != k) {
 		i__1 = p - k;
 		dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
 			lda);
 		i__1 = *n - p + 1;
 		dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
 			c__1);
 		dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
 		dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
 	    }
 	    if (kp != kk) {
 		a[kp + k * a_dim1] = a[kk + k * a_dim1];
 		i__1 = kp - k - 1;
 		dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
 			* a_dim1], lda);
 		i__1 = *n - kp + 1;
 		dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
 			a_dim1], &c__1);
 		dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
 			r1 = 1. / a[k + k * a_dim1];
 			i__1 = *n - k;
 			dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		    } else if (a[k + k * a_dim1] != 0.) {
 			i__1 = *n;
 			for (ii = k + 1; ii <= i__1; ++ii) {
 			    a[ii + k * a_dim1] /= a[k + k * a_dim1];
 /* L74: */
 			}
 		    }
 		}
 	    } else {
 		if (k < *n - 1) {
 		    d21 = w[k + 1 + k * w_dim1];
 		    d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
 		    d22 = w[k + k * w_dim1] / d21;
 		    t = 1. / (d11 * d22 - 1.);
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
 				j + (k + 1) * w_dim1]) / d21);
 			a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
 				w_dim1] - w[j + k * w_dim1]) / d21);
 /* L80: */
 		    }
 		}
 		a[k + k * a_dim1] = w[k + k * w_dim1];
 		a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
 		a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    --j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	--j;
 	if (jp2 != jj && j >= 1) {
 	    dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	jj = j + 1;
 	if (jp1 != jj && kstep == 2) {
 	    dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j >= 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/dtgsyl.c
+++ b/relapack/src/dtgsyl.c
@@ -0,0 +1,274 @@
 #include "relapack.h"
 #include <math.h>

 static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
    const int *, const double *, const int *, const double *, const int *,
    double *, const int *, const double *, const int *, const double *,
    const int *, double *, const int *, double *, double *, double *, int *,
    int *, int *);


 /** DTGSYL solves the generalized Sylvester equation.
 *
 * This routine is functionally equivalent to LAPACK's dtgsyl.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html
 * */
 void RELAPACK_dtgsyl(
    const char *trans, const int *ijob, const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC,
    const double *D, const int *ldD, const double *E, const int *ldE,
    double *F, const int *ldF,
    double *scale, double *dif,
    double *Work, const int *lWork, int *iWork, int *info
 ) {

    // Parse arguments
    const int notran = LAPACK(lsame)(trans, "N");
    const int tran = LAPACK(lsame)(trans, "T");

    // Compute work buffer size
    int lwmin = 1;
    if (notran && (*ijob == 1 || *ijob == 2))
        lwmin = MAX(1, 2 * *m * *n);
    *info = 0;

    // Check arguments
    if (!tran && !notran)
        *info = -1;
    else if (notran && (*ijob < 0 || *ijob > 4))
        *info = -2;
    else if (*m <= 0)
        *info = -3;
    else if (*n <= 0)
        *info = -4;
    else if (*ldA < MAX(1, *m))
        *info = -6;
    else if (*ldB < MAX(1, *n))
        *info = -8;
    else if (*ldC < MAX(1, *m))
        *info = -10;
    else if (*ldD < MAX(1, *m))
        *info = -12;
    else if (*ldE < MAX(1, *n))
        *info = -14;
    else if (*ldF < MAX(1, *m))
        *info = -16;
    else if (*lWork < lwmin && *lWork != -1)
        *info = -20;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DTGSYL", &minfo);
        return;
    }

    if (*lWork == -1) {
        // Work size query
        *Work = lwmin;
        return;
    }

    // Clean char * arguments
    const char cleantrans = notran ? 'N' : 'T';

    // Constant
    const double ZERO[] = { 0. };

    int isolve = 1;
    int ifunc  = 0;
    if (notran) {
        if (*ijob >= 3) {
            ifunc = *ijob - 2;
            LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
            LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
        } else if (*ijob >= 1)
            isolve = 2;
    }

    double scale2;
    int iround;
    for (iround = 1; iround <= isolve; iround++) {
        *scale = 1;
        double dscale = 0;
        double dsum   = 1;
        int pq;
        RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
        if (dscale != 0) {
            if (*ijob == 1 || *ijob == 3)
                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
            else
                *dif = sqrt(pq) / (dscale * sqrt(dsum));
        }
        if (isolve == 2) {
            if (iround == 1) {
                if (notran)
                    ifunc = *ijob;
                scale2 = *scale;
                LAPACK(dlacpy)("F", m, n, C, ldC, Work, m);
                LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m);
                LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
                LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
            } else {
                LAPACK(dlacpy)("F", m, n, Work, m, C, ldC);
                LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF);
                *scale = scale2;
            }
        }
    }
 }


 /** dtgsyl's recursive vompute kernel */
 static void RELAPACK_dtgsyl_rec(
    const char *trans, const int *ifunc, const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC,
    const double *D, const int *ldD, const double *E, const int *ldE,
    double *F, const int *ldF,
    double *scale, double *dsum, double *dscale,
    int *iWork, int *pq, int *info
 ) {

    if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) {
        // Unblocked
        LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };
    const int    iONE[] = { 1 };

    // Outputs
    double scale1[] = { 1. };
    double scale2[] = { 1. };
    int    info1[]  = { 0 };
    int    info2[]  = { 0 };

    if (*m > *n) {
        // Splitting
        int m1 = DREC_SPLIT(*m);
        if (A[m1 + *ldA * (m1 - 1)])
            m1++;
        const int m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
        const double *const A_TL = A;
        const double *const A_TR = A + *ldA * m1;
        const double *const A_BR = A + *ldA * m1 + m1;

        // C_T
        // C_B
        double *const C_T = C;
        double *const C_B = C + m1;

        // D_TL D_TR
        // 0    D_BR
        const double *const D_TL = D;
        const double *const D_TR = D + *ldD * m1;
        const double *const D_BR = D + *ldD * m1 + m1;

        // F_T
        // F_B
        double *const F_T = F;
        double *const F_B = F + m1;

        if (*trans == 'N') {
            // recursion(A_BR, B, C_B, D_BR, E, F_B)
            RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
            // C_T = C_T - A_TR * C_B
            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
            // F_T = F_T - D_TR * C_B
            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
            // recursion(A_TL, B, C_T, D_TL, E, F_T)
            RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
            }
        } else {
            // recursion(A_TL, B, C_T, D_TL, E, F_T)
            RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
            // apply scale
            if (scale1[0] != 1)
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
            // C_B = C_B - A_TR^H * C_T
            BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
            // C_B = C_B - D_TR^H * F_T
            BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
            // recursion(A_BR, B, C_B, D_BR, E, F_B)
            RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
            }
        }
    } else {
        // Splitting
        int n1 = DREC_SPLIT(*n);
        if (B[n1 + *ldB * (n1 - 1)])
            n1++;
        const int n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
        const double *const B_TL = B;
        const double *const B_TR = B + *ldB * n1;
        const double *const B_BR = B + *ldB * n1 + n1;

        // C_L C_R
        double *const C_L = C;
        double *const C_R = C + *ldC * n1;

        // E_TL E_TR
        // 0    E_BR
        const double *const E_TL = E;
        const double *const E_TR = E + *ldE * n1;
        const double *const E_BR = E + *ldE * n1 + n1;

        // F_L F_R
        double *const F_L = F;
        double *const F_R = F + *ldF * n1;

        if (*trans == 'N') {
            // recursion(A, B_TL, C_L, D, E_TL, F_L)
            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
            // C_R = C_R + F_L * B_TR
            BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
            // F_R = F_R + F_L * E_TR
            BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
            // recursion(A, B_BR, C_R, D, E_BR, F_R)
            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
            }
        } else {
            // recursion(A, B_BR, C_R, D, E_BR, F_R)
            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
            // apply scale
            if (scale1[0] != 1)
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
            // F_L = F_L + C_R * B_TR
            BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
            // F_L = F_L + F_R * E_TR
            BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
            // recursion(A, B_TL, C_L, D, E_TL, F_L)
            RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
            }
        }
    }

    *scale = scale1[0] * scale2[0];
    *info  = info1[0] || info2[0];
 }
--- a/relapack/src/dtrsyl.c
+++ b/relapack/src/dtrsyl.c
@@ -0,0 +1,169 @@
 #include "relapack.h"

 static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *,
    const int *, const int *, const double *, const int *, const double *,
    const int *, double *, const int *, double *, int *);


 /** DTRSYL solves the real Sylvester matrix equation.
 *
 * This routine is functionally equivalent to LAPACK's dtrsyl.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html
 * */
 void RELAPACK_dtrsyl(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC, double *scale,
    int *info
 ) {

    // Check arguments
    const int notransA = LAPACK(lsame)(tranA, "N");
    const int transA = LAPACK(lsame)(tranA, "T");
    const int ctransA = LAPACK(lsame)(tranA, "C");
    const int notransB = LAPACK(lsame)(tranB, "N");
    const int transB = LAPACK(lsame)(tranB, "T");
    const int ctransB = LAPACK(lsame)(tranB, "C");
    *info = 0;
    if (!transA && !ctransA && !notransA)
        *info = -1;
    else if (!transB && !ctransB && !notransB)
        *info = -2;
    else if (*isgn != 1 && *isgn != -1)
        *info = -3;
    else if (*m < 0)
        *info = -4;
    else if (*n < 0)
        *info = -5;
    else if (*ldA < MAX(1, *m))
        *info = -7;
    else if (*ldB < MAX(1, *n))
        *info = -9;
    else if (*ldC < MAX(1, *m))
        *info = -11;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DTRSYL", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
    const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');

    // Recursive kernel
    RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }


 /** dtrsyl's recursive compute kernel */
 static void RELAPACK_dtrsyl_rec(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC, double *scale,
    int *info
 ) {

    if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) {
        // Unblocked
        RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };
    const double MSGN[] = { -*isgn };
    const int    iONE[] = { 1 };

    // Outputs
    double scale1[] = { 1. };
    double scale2[] = { 1. };
    int    info1[]  = { 0 };
    int    info2[]  = { 0 };

    if (*m > *n) {
        // Splitting
        int m1 = DREC_SPLIT(*m);
        if (A[m1 + *ldA * (m1 - 1)])
            m1++;
        const int m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
        const double *const A_TL = A;
        const double *const A_TR = A + *ldA * m1;
        const double *const A_BR = A + *ldA * m1 + m1;

        // C_T
        // C_B
        double *const C_T = C;
        double *const C_B = C + m1;

        if (*tranA == 'N') {
            // recusion(A_BR, B, C_B)
            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
            // C_T = C_T - A_TR * C_B
            BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
            // recusion(A_TL, B, C_T)
            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
        } else {
            // recusion(A_TL, B, C_T)
            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
            // C_B = C_B - A_TR' * C_T
            BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
            // recusion(A_BR, B, C_B)
            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
        }
    } else {
        // Splitting
        int n1 = DREC_SPLIT(*n);
        if (B[n1 + *ldB * (n1 - 1)])
            n1++;
        const int n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
        const double *const B_TL = B;
        const double *const B_TR = B + *ldB * n1;
        const double *const B_BR = B + *ldB * n1 + n1;

        // C_L C_R
        double *const C_L = C;
        double *const C_R = C + *ldC * n1;

        if (*tranB == 'N') {
            // recusion(A, B_TL, C_L)
            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
            // C_R = C_R -/+ C_L * B_TR
            BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
            // recusion(A, B_BR, C_R)
            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
        } else {
            // recusion(A, B_BR, C_R)
            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
            // C_L = C_L -/+ C_R * B_TR'
            BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
            // recusion(A, B_TL, C_L)
            RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
        }
    }

    *scale = scale1[0] * scale2[0];
    *info  = info1[0] || info2[0];
 }
--- a/relapack/src/dtrsyl_rec2.c
+++ b/relapack/src/dtrsyl_rec2.c
--- a/relapack/src/dtrtri.c
+++ b/relapack/src/dtrtri.c
@@ -0,0 +1,107 @@
 #include "relapack.h"

 static void RELAPACK_dtrtri_rec(const char *, const char *, const int *,
    double *, const int *, int *);


 /** DTRTRI computes the inverse of a real upper or lower triangular matrix A.
 *
 * This routine is functionally equivalent to LAPACK's dtrtri.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html
 * */
 void RELAPACK_dtrtri(
    const char *uplo, const char *diag, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    const int nounit = LAPACK(lsame)(diag, "N");
    const int unit = LAPACK(lsame)(diag, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (!nounit && !unit)
        *info = -2;
    else if (*n < 0)
        *info = -3;
    else if (*ldA < MAX(1, *n))
        *info = -5;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DTRTRI", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower  ? 'L' : 'U';
    const char cleandiag = nounit ? 'N' : 'U';

    // check for singularity
    if (nounit) {
        int i;
        for (i = 0; i < *n; i++)
            if (A[i + *ldA * i] == 0) {
                *info = i;
                return;
            }
    }

    // Recursive kernel
    RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
 }


 /** dtrtri's recursive compute kernel */
 static void RELAPACK_dtrtri_rec(
    const char *uplo, const char *diag, const int *n,
    double *A, const int *ldA,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_DTRTRI, 1)) {
        // Unblocked
        LAPACK(dtrti2)(uplo, diag, n, A, ldA, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };

    // Splitting
    const int n1 = DREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A             + n1;
    double *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL)
    RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
    if (*info)
        return;

    if (*uplo == 'L') {
        // A_BL = - A_BL * A_TL
        BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
        // A_BL = A_BR \ A_BL
        BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
    } else {
        // A_TR = - A_TL * A_TR
        BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
        // A_TR = A_TR / A_BR
        BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/f2c.c
+++ b/relapack/src/f2c.c
@@ -0,0 +1,109 @@
 #include "stdlib.h"
 #include "stdio.h"
 #include "signal.h"
 #include "f2c.h"

 #ifndef SIGIOT
 #ifdef SIGABRT
 #define SIGIOT SIGABRT
 #endif
 #endif

 void sig_die(const char *s, int kill) {
 	/* print error message, then clear buffers */
 	fprintf(stderr, "%s\n", s);

 	if(kill) {
 		fflush(stderr);
 		/* now get a core */
 		signal(SIGIOT, SIG_DFL);
 		abort();
 	} else
 		exit(1);
 }

 void c_div(complex *c, complex *a, complex *b) {
 	double ratio, den;
 	double abr, abi, cr;

 	if( (abr = b->r) < 0.)
 		abr = - abr;
 	if( (abi = b->i) < 0.)
 		abi = - abi;
 	if( abr <= abi ) {
 		if(abi == 0) {
 #ifdef IEEE_COMPLEX_DIVIDE
 			float af, bf;
 			af = bf = abr;
 			if (a->i != 0 || a->r != 0)
 				af = 1.;
 			c->i = c->r = af / bf;
 			return;
 #else
 			sig_die("complex division by zero", 1);
 #endif
 		}
 		ratio = (double)b->r / b->i ;
 		den = b->i * (1 + ratio*ratio);
 		cr = (a->r*ratio + a->i) / den;
 		c->i = (a->i*ratio - a->r) / den;
 	} else {
 		ratio = (double)b->i / b->r ;
 		den = b->r * (1 + ratio*ratio);
 		cr = (a->r + a->i*ratio) / den;
 		c->i = (a->i - a->r*ratio) / den;
    }
 	c->r = cr;
 }

 void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) {
 	double ratio, den;
 	double abr, abi, cr;

 	if( (abr = b->r) < 0.)
 		abr = - abr;
 	if( (abi = b->i) < 0.)
 		abi = - abi;
 	if( abr <= abi ) {
 		if(abi == 0) {
 #ifdef IEEE_COMPLEX_DIVIDE
 			if (a->i != 0 || a->r != 0)
 				abi = 1.;
 			c->i = c->r = abi / abr;
 			return;
 #else
 			sig_die("complex division by zero", 1);
 #endif
        }
 		ratio = b->r / b->i ;
 		den = b->i * (1 + ratio*ratio);
 		cr = (a->r*ratio + a->i) / den;
 		c->i = (a->i*ratio - a->r) / den;
 	} else {
 		ratio = b->i / b->r ;
 		den = b->r * (1 + ratio*ratio);
 		cr = (a->r + a->i*ratio) / den;
 		c->i = (a->i - a->r*ratio) / den;
    }
 	c->r = cr;
 }

 float r_imag(complex *z) {
    return z->i;
 }

 void r_cnjg(complex *r, complex *z) {
 	float zi = z->i;
 	r->r = z->r;
 	r->i = -zi;
 }

 double d_imag(doublecomplex *z) {
    return z->i;
 }

 void d_cnjg(doublecomplex *r, doublecomplex *z) {
 	double zi = z->i;
 	r->r = z->r;
 	r->i = -zi;
 }
--- a/relapack/src/f2c.h
+++ b/relapack/src/f2c.h
@@ -0,0 +1,223 @@
 /* f2c.h  --  Standard Fortran to C header file */

 /**  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."

 	- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */

 #ifndef F2C_INCLUDE
 #define F2C_INCLUDE

 typedef long int integer;
 typedef unsigned long int uinteger;
 typedef char *address;
 typedef short int shortint;
 typedef float real;
 typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
 typedef long int logical;
 typedef short int shortlogical;
 typedef char logical1;
 typedef char integer1;
 #ifdef INTEGER_STAR_8	/* Adjust for integer*8. */
 typedef long long longint;		/* system-dependent */
 typedef unsigned long long ulongint;	/* system-dependent */
 #define qbit_clear(a,b)	((a) & ~((ulongint)1 << (b)))
 #define qbit_set(a,b)	((a) |  ((ulongint)1 << (b)))
 #endif

 #define TRUE_ (1)
 #define FALSE_ (0)

 /* Extern is for use with -E */
 #ifndef Extern
 #define Extern extern
 #endif

 /* I/O stuff */

 #ifdef f2c_i2
 /* for -i2 */
 typedef short flag;
 typedef short ftnlen;
 typedef short ftnint;
 #else
 typedef long int flag;
 typedef long int ftnlen;
 typedef long int ftnint;
 #endif

 /*external read, write*/
 typedef struct
 {	flag cierr;
 	ftnint ciunit;
 	flag ciend;
 	char *cifmt;
 	ftnint cirec;
 } cilist;

 /*internal read, write*/
 typedef struct
 {	flag icierr;
 	char *iciunit;
 	flag iciend;
 	char *icifmt;
 	ftnint icirlen;
 	ftnint icirnum;
 } icilist;

 /*open*/
 typedef struct
 {	flag oerr;
 	ftnint ounit;
 	char *ofnm;
 	ftnlen ofnmlen;
 	char *osta;
 	char *oacc;
 	char *ofm;
 	ftnint orl;
 	char *oblnk;
 } olist;

 /*close*/
 typedef struct
 {	flag cerr;
 	ftnint cunit;
 	char *csta;
 } cllist;

 /*rewind, backspace, endfile*/
 typedef struct
 {	flag aerr;
 	ftnint aunit;
 } alist;

 /* inquire */
 typedef struct
 {	flag inerr;
 	ftnint inunit;
 	char *infile;
 	ftnlen infilen;
 	ftnint	*inex;	/*parameters in standard's order*/
 	ftnint	*inopen;
 	ftnint	*innum;
 	ftnint	*innamed;
 	char	*inname;
 	ftnlen	innamlen;
 	char	*inacc;
 	ftnlen	inacclen;
 	char	*inseq;
 	ftnlen	inseqlen;
 	char 	*indir;
 	ftnlen	indirlen;
 	char	*infmt;
 	ftnlen	infmtlen;
 	char	*inform;
 	ftnint	informlen;
 	char	*inunf;
 	ftnlen	inunflen;
 	ftnint	*inrecl;
 	ftnint	*innrec;
 	char	*inblank;
 	ftnlen	inblanklen;
 } inlist;

 #define VOID void

 union Multitype {	/* for multiple entry points */
 	integer1 g;
 	shortint h;
 	integer i;
 	/* longint j; */
 	real r;
 	doublereal d;
 	complex c;
 	doublecomplex z;
 	};

 typedef union Multitype Multitype;

 /*typedef long int Long;*/	/* No longer used; formerly in Namelist */

 struct Vardesc {	/* for Namelist */
 	char *name;
 	char *addr;
 	ftnlen *dims;
 	int  type;
 	};
 typedef struct Vardesc Vardesc;

 struct Namelist {
 	char *name;
 	Vardesc **vars;
 	int nvars;
 	};
 typedef struct Namelist Namelist;

 #define abs(x) ((x) >= 0 ? (x) : -(x))
 #define dabs(x) (doublereal)abs(x)
 #define min(a,b) ((a) <= (b) ? (a) : (b))
 #define max(a,b) ((a) >= (b) ? (a) : (b))
 #define dmin(a,b) (doublereal)min(a,b)
 #define dmax(a,b) (doublereal)max(a,b)
 #define bit_test(a,b)	((a) >> (b) & 1)
 #define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
 #define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))

 /* procedure parameter types for -A and -C++ */

 #define F2C_proc_par_types 1
 #ifdef __cplusplus
 typedef int /* Unknown procedure type */ (*U_fp)(...);
 typedef shortint (*J_fp)(...);
 typedef integer (*I_fp)(...);
 typedef real (*R_fp)(...);
 typedef doublereal (*D_fp)(...), (*E_fp)(...);
 typedef /* Complex */ VOID (*C_fp)(...);
 typedef /* Double Complex */ VOID (*Z_fp)(...);
 typedef logical (*L_fp)(...);
 typedef shortlogical (*K_fp)(...);
 typedef /* Character */ VOID (*H_fp)(...);
 typedef /* Subroutine */ int (*S_fp)(...);
 #else
 typedef int /* Unknown procedure type */ (*U_fp)();
 typedef shortint (*J_fp)();
 typedef integer (*I_fp)();
 typedef real (*R_fp)();
 typedef doublereal (*D_fp)(), (*E_fp)();
 typedef /* Complex */ VOID (*C_fp)();
 typedef /* Double Complex */ VOID (*Z_fp)();
 typedef logical (*L_fp)();
 typedef shortlogical (*K_fp)();
 typedef /* Character */ VOID (*H_fp)();
 typedef /* Subroutine */ int (*S_fp)();
 #endif
 /* E_fp is for real functions when -R is not specified */
 typedef VOID C_f;	/* complex function */
 typedef VOID H_f;	/* character function */
 typedef VOID Z_f;	/* double complex function */
 typedef doublereal E_f;	/* real function with -R not specified */

 /* undef any lower-case symbols that your C compiler predefines, e.g.: */

 #ifndef Skip_f2c_Undefs
 #undef cray
 #undef gcos
 #undef mc68010
 #undef mc68020
 #undef mips
 #undef pdp11
 #undef sgi
 #undef sparc
 #undef sun
 #undef sun2
 #undef sun3
 #undef sun4
 #undef u370
 #undef u3b
 #undef u3b2
 #undef u3b5
 #undef unix
 #undef vax
 #endif
 #endif
--- a/relapack/src/lapack.h
+++ b/relapack/src/lapack.h
@@ -0,0 +1,80 @@
 #ifndef LAPACK_H
 #define LAPACK_H

 extern int LAPACK(lsame)(const char *, const char *);
 extern int LAPACK(xerbla)(const char *, const int *);

 extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
 extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
 extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
 extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);

 extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
 extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
 extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
 extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);

 extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
 extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
 extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
 extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);

 extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
 extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
 extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
 extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);

 extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *);
 extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *);
 extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *);
 extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *);

 extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
 extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
 extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
 extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);

 extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *);
 extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *);
 extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *);
 extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *);

 extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *);
 extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *);
 extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *);
 extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *);

 extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *);
 extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
 extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *);
 extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *);

 extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *);
 extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *);
 extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *);
 extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
 extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
 extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *);

 extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *);
 extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *);

 extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
 extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
 extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);

 extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *);
 extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *);
 extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *);
 extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *);

 #endif /* LAPACK_H */
--- a/relapack/src/lapack_wrappers.c
+++ b/relapack/src/lapack_wrappers.c
@@ -0,0 +1,607 @@
 #include "relapack.h"

 ////////////
 // XLAUUM //
 ////////////

 #if INCLUDE_SLAUUM
 void LAPACK(slauum)(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_slauum(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_DLAUUM
 void LAPACK(dlauum)(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_dlauum(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_CLAUUM
 void LAPACK(clauum)(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_clauum(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_ZLAUUM
 void LAPACK(zlauum)(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_zlauum(uplo, n, A, ldA, info);
 }
 #endif


 ////////////
 // XSYGST //
 ////////////

 #if INCLUDE_SSYGST
 void LAPACK(ssygst)(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    int *info
 ) {
    RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
 }
 #endif

 #if INCLUDE_DSYGST
 void LAPACK(dsygst)(
    const int *itype, const char *uplo, const int *n,
    double *A, const int *ldA, const double *B, const int *ldB,
    int *info
 ) {
    RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
 }
 #endif

 #if INCLUDE_CHEGST
 void LAPACK(chegst)(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    int *info
 ) {
    RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info);
 }
 #endif

 #if INCLUDE_ZHEGST
 void LAPACK(zhegst)(
    const int *itype, const char *uplo, const int *n,
    double *A, const int *ldA, const double *B, const int *ldB,
    int *info
 ) {
    RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info);
 }
 #endif


 ////////////
 // XTRTRI //
 ////////////

 #if INCLUDE_STRTRI
 void LAPACK(strtri)(
    const char *uplo, const char *diag, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_strtri(uplo, diag, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_DTRTRI
 void LAPACK(dtrtri)(
    const char *uplo, const char *diag, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_CTRTRI
 void LAPACK(ctrtri)(
    const char *uplo, const char *diag, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_ZTRTRI
 void LAPACK(ztrtri)(
    const char *uplo, const char *diag, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
 }
 #endif


 ////////////
 // XPOTRF //
 ////////////

 #if INCLUDE_SPOTRF
 void LAPACK(spotrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_spotrf(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_DPOTRF
 void LAPACK(dpotrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_dpotrf(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_CPOTRF
 void LAPACK(cpotrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_cpotrf(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_ZPOTRF
 void LAPACK(zpotrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_zpotrf(uplo, n, A, ldA, info);
 }
 #endif


 ////////////
 // XPBTRF //
 ////////////

 #if INCLUDE_SPBTRF
 void LAPACK(spbtrf)(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    int *info
 ) {
    RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
 }
 #endif

 #if INCLUDE_DPBTRF
 void LAPACK(dpbtrf)(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    int *info
 ) {
    RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
 }
 #endif

 #if INCLUDE_CPBTRF
 void LAPACK(cpbtrf)(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    int *info
 ) {
    RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
 }
 #endif

 #if INCLUDE_ZPBTRF
 void LAPACK(zpbtrf)(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    int *info
 ) {
    RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
 }
 #endif


 ////////////
 // XSYTRF //
 ////////////

 #if INCLUDE_SSYTRF
 void LAPACK(ssytrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_DSYTRF
 void LAPACK(dsytrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_CSYTRF
 void LAPACK(csytrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_ZSYTRF
 void LAPACK(zsytrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_CHETRF
 void LAPACK(chetrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_ZHETRF
 void LAPACK(zhetrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_SSYTRF_ROOK
 void LAPACK(ssytrf_rook)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_DSYTRF_ROOK
 void LAPACK(dsytrf_rook)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_CSYTRF_ROOK
 void LAPACK(csytrf_rook)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_ZSYTRF_ROOK
 void LAPACK(zsytrf_rook)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_CHETRF_ROOK
 void LAPACK(chetrf_rook)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_ZHETRF_ROOK
 void LAPACK(zhetrf_rook)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif


 ////////////
 // XGETRF //
 ////////////

 #if INCLUDE_SGETRF
 void LAPACK(sgetrf)(
    const int *m, const int *n,
    float *A, const int *ldA, int *ipiv,
    int *info
 ) {
    RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
 }
 #endif

 #if INCLUDE_DGETRF
 void LAPACK(dgetrf)(
    const int *m, const int *n,
    double *A, const int *ldA, int *ipiv,
    int *info
 ) {
    RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
 }
 #endif

 #if INCLUDE_CGETRF
 void LAPACK(cgetrf)(
    const int *m, const int *n,
    float *A, const int *ldA, int *ipiv,
    int *info
 ) {
    RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
 }
 #endif

 #if INCLUDE_ZGETRF
 void LAPACK(zgetrf)(
    const int *m, const int *n,
    double *A, const int *ldA, int *ipiv,
    int *info
 ) {
    RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
 }
 #endif


 ////////////
 // XGBTRF //
 ////////////

 #if INCLUDE_SGBTRF
 void LAPACK(sgbtrf)(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {
    RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
 }
 #endif

 #if INCLUDE_DGBTRF
 void LAPACK(dgbtrf)(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {
    RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
 }
 #endif

 #if INCLUDE_CGBTRF
 void LAPACK(cgbtrf)(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {
    RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
 }
 #endif

 #if INCLUDE_ZGBTRF
 void LAPACK(zgbtrf)(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {
    RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
 }
 #endif


 ////////////
 // XTRSYL //
 ////////////

 #if INCLUDE_STRSYL
 void LAPACK(strsyl)(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC, float *scale,
    int *info
 ) {
    RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }
 #endif

 #if INCLUDE_DTRSYL
 void LAPACK(dtrsyl)(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC, double *scale,
    int *info
 ) {
    RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }
 #endif

 #if INCLUDE_CTRSYL
 void LAPACK(ctrsyl)(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC, float *scale,
    int *info
 ) {
    RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }
 #endif

 #if INCLUDE_ZTRSYL
 void LAPACK(ztrsyl)(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC, double *scale,
    int *info
 ) {
    RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }
 #endif


 ////////////
 // XTGSYL //
 ////////////

 #if INCLUDE_STGSYL
 void LAPACK(stgsyl)(
    const char *trans, const int *ijob, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dif,
    float *Work, const int *lWork, int *iWork, int *info
 ) {
    RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
 }
 #endif

 #if INCLUDE_DTGSYL
 void LAPACK(dtgsyl)(
    const char *trans, const int *ijob, const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC,
    const double *D, const int *ldD, const double *E, const int *ldE,
    double *F, const int *ldF,
    double *scale, double *dif,
    double *Work, const int *lWork, int *iWork, int *info
 ) {
    RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
 }
 #endif

 #if INCLUDE_CTGSYL
 void LAPACK(ctgsyl)(
    const char *trans, const int *ijob, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dif,
    float *Work, const int *lWork, int *iWork, int *info
 ) {
    RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
 }
 #endif

 #if INCLUDE_ZTGSYL
 void LAPACK(ztgsyl)(
    const char *trans, const int *ijob, const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC,
    const double *D, const int *ldD, const double *E, const int *ldE,
    double *F, const int *ldF,
    double *scale, double *dif,
    double *Work, const int *lWork, int *iWork, int *info
 ) {
    RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
 }
 #endif


 ////////////
 // XGEMMT //
 ////////////

 #if INCLUDE_SGEMMT
 void LAPACK(sgemmt)(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {
    RELAPACK_sgemmt(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_DGEMMT
 void LAPACK(dgemmt)(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {
    RELAPACK_dgemmt(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_CGEMMT
 void LAPACK(cgemmt)(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {
    RELAPACK_cgemmt(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_ZGEMMT
 void LAPACK(zgemmt)(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {
    RELAPACK_zgemmt(uplo, n, A, ldA, info);
 }
 #endif
--- a/relapack/src/lapack_wrappers.c.orig
+++ b/relapack/src/lapack_wrappers.c.orig
@@ -0,0 +1,607 @@
 #include "relapack.h"

 ////////////
 // XLAUUM //
 ////////////

 #if INCLUDE_SLAUUM
 void LAPACK(slauum)(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_slauum(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_DLAUUM
 void LAPACK(dlauum)(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_dlauum(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_CLAUUM
 void LAPACK(clauum)(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_clauum(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_ZLAUUM
 void LAPACK(zlauum)(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_zlauum(uplo, n, A, ldA, info);
 }
 #endif


 ////////////
 // XSYGST //
 ////////////

 #if INCLUDE_SSYGST
 void LAPACK(ssygst)(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    int *info
 ) {
    RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
 }
 #endif

 #if INCLUDE_DSYGST
 void LAPACK(dsygst)(
    const int *itype, const char *uplo, const int *n,
    double *A, const int *ldA, const double *B, const int *ldB,
    int *info
 ) {
    RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
 }
 #endif

 #if INCLUDE_CSYGST
 void LAPACK(csygst)(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    int *info
 ) {
    RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info);
 }
 #endif

 #if INCLUDE_ZSYGST
 void LAPACK(zsygst)(
    const int *itype, const char *uplo, const int *n,
    double *A, const int *ldA, const double *B, const int *ldB,
    int *info
 ) {
    RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info);
 }
 #endif


 ////////////
 // XTRTRI //
 ////////////

 #if INCLUDE_STRTRI
 void LAPACK(strtri)(
    const char *uplo, const char *diag, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_strtri(uplo, diag, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_DTRTRI
 void LAPACK(dtrtri)(
    const char *uplo, const char *diag, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_CTRTRI
 void LAPACK(ctrtri)(
    const char *uplo, const char *diag, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_ZTRTRI
 void LAPACK(ztrtri)(
    const char *uplo, const char *diag, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
 }
 #endif


 ////////////
 // XPOTRF //
 ////////////

 #if INCLUDE_SPOTRF
 void LAPACK(spotrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_spotrf(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_DPOTRF
 void LAPACK(dpotrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_dpotrf(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_CPOTRF
 void LAPACK(cpotrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {
    RELAPACK_cpotrf(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_ZPOTRF
 void LAPACK(zpotrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {
    RELAPACK_zpotrf(uplo, n, A, ldA, info);
 }
 #endif


 ////////////
 // XPBTRF //
 ////////////

 #if INCLUDE_SPBTRF
 void LAPACK(spbtrf)(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    int *info
 ) {
    RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
 }
 #endif

 #if INCLUDE_DPBTRF
 void LAPACK(dpbtrf)(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    int *info
 ) {
    RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
 }
 #endif

 #if INCLUDE_CPBTRF
 void LAPACK(cpbtrf)(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    int *info
 ) {
    RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
 }
 #endif

 #if INCLUDE_ZPBTRF
 void LAPACK(zpbtrf)(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    int *info
 ) {
    RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
 }
 #endif


 ////////////
 // XSYTRF //
 ////////////

 #if INCLUDE_SSYTRF
 void LAPACK(ssytrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_DSYTRF
 void LAPACK(dsytrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_CSYTRF
 void LAPACK(csytrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_ZSYTRF
 void LAPACK(zsytrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_CHETRF
 void LAPACK(chetrf)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_ZHETRF
 void LAPACK(zhetrf)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_SSYTRF_ROOK
 void LAPACK(ssytrf_rook)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_DSYTRF_ROOK
 void LAPACK(dsytrf_rook)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_CSYTRF_ROOK
 void LAPACK(csytrf_rook)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_ZSYTRF_ROOK
 void LAPACK(zsytrf_rook)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_CHETRF_ROOK
 void LAPACK(chetrf_rook)(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {
    RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif

 #if INCLUDE_ZHETRF_ROOK
 void LAPACK(zhetrf_rook)(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {
    RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
 }
 #endif


 ////////////
 // XGETRF //
 ////////////

 #if INCLUDE_SGETRF
 void LAPACK(sgetrf)(
    const int *m, const int *n,
    float *A, const int *ldA, int *ipiv,
    int *info
 ) {
    RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
 }
 #endif

 #if INCLUDE_DGETRF
 void LAPACK(dgetrf)(
    const int *m, const int *n,
    double *A, const int *ldA, int *ipiv,
    int *info
 ) {
    RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
 }
 #endif

 #if INCLUDE_CGETRF
 void LAPACK(cgetrf)(
    const int *m, const int *n,
    float *A, const int *ldA, int *ipiv,
    int *info
 ) {
    RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
 }
 #endif

 #if INCLUDE_ZGETRF
 void LAPACK(zgetrf)(
    const int *m, const int *n,
    double *A, const int *ldA, int *ipiv,
    int *info
 ) {
    RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
 }
 #endif


 ////////////
 // XGBTRF //
 ////////////

 #if INCLUDE_SGBTRF
 void LAPACK(sgbtrf)(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {
    RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
 }
 #endif

 #if INCLUDE_DGBTRF
 void LAPACK(dgbtrf)(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {
    RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
 }
 #endif

 #if INCLUDE_CGBTRF
 void LAPACK(cgbtrf)(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {
    RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
 }
 #endif

 #if INCLUDE_ZGBTRF
 void LAPACK(zgbtrf)(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {
    RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
 }
 #endif


 ////////////
 // XTRSYL //
 ////////////

 #if INCLUDE_STRSYL
 void LAPACK(strsyl)(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC, float *scale,
    int *info
 ) {
    RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }
 #endif

 #if INCLUDE_DTRSYL
 void LAPACK(dtrsyl)(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC, double *scale,
    int *info
 ) {
    RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }
 #endif

 #if INCLUDE_CTRSYL
 void LAPACK(ctrsyl)(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC, float *scale,
    int *info
 ) {
    RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }
 #endif

 #if INCLUDE_ZTRSYL
 void LAPACK(ztrsyl)(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC, double *scale,
    int *info
 ) {
    RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }
 #endif


 ////////////
 // XTGSYL //
 ////////////

 #if INCLUDE_STGSYL
 void LAPACK(stgsyl)(
    const char *trans, const int *ijob, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dif,
    float *Work, const int *lWork, int *iWork, int *info
 ) {
    RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
 }
 #endif

 #if INCLUDE_DTGSYL
 void LAPACK(dtgsyl)(
    const char *trans, const int *ijob, const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC,
    const double *D, const int *ldD, const double *E, const int *ldE,
    double *F, const int *ldF,
    double *scale, double *dif,
    double *Work, const int *lWork, int *iWork, int *info
 ) {
    RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
 }
 #endif

 #if INCLUDE_CTGSYL
 void LAPACK(ctgsyl)(
    const char *trans, const int *ijob, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dif,
    float *Work, const int *lWork, int *iWork, int *info
 ) {
    RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
 }
 #endif

 #if INCLUDE_ZTGSYL
 void LAPACK(ztgsyl)(
    const char *trans, const int *ijob, const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC,
    const double *D, const int *ldD, const double *E, const int *ldE,
    double *F, const int *ldF,
    double *scale, double *dif,
    double *Work, const int *lWork, int *iWork, int *info
 ) {
    RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
 }
 #endif


 ////////////
 // XGEMMT //
 ////////////

 #if INCLUDE_SGEMMT
 void LAPACK(sgemmt)(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {
    RELAPACK_sgemmt(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_DGEMMT
 void LAPACK(dgemmt)(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {
    RELAPACK_dgemmt(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_CGEMMT
 void LAPACK(cgemmt)(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {
    RELAPACK_cgemmt(uplo, n, A, ldA, info);
 }
 #endif

 #if INCLUDE_ZGEMMT
 void LAPACK(zgemmt)(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {
    RELAPACK_zgemmt(uplo, n, A, ldA, info);
 }
 #endif
--- a/relapack/src/relapack.h
+++ b/relapack/src/relapack.h
@@ -0,0 +1,60 @@
 #ifndef RELAPACK_INT_H
 #define RELAPACK_INT_H

 #include "../config.h"

 #include "../inc/relapack.h"

 // add an underscore to BLAS routines (or not)
 #if BLAS_UNDERSCORE
 #define BLAS(routine) routine ## _
 #else
 #define BLAS(routine) routine
 #endif

 // add an underscore to LAPACK routines (or not)
 #if LAPACK_UNDERSCORE
 #define LAPACK(routine) routine ## _
 #else
 #define LAPACK(routine) routine
 #endif

 // minimum and maximum macros
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))

 // REC_SPLIT(n) returns how a problem of size n is split recursively.
 // If n >= 16, we ensure that the size of at least one of the halves is
 // divisible by 8 (the cache line size in most CPUs), while both halves are
 // still as close as possible in size.
 // If n < 16 the problem is simply split in the middle. (Note that the
 // crossoversize is usually larger than 16.)
 #define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2)
 #define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
 #define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
 #define ZREC_SPLIT(n) ((n >= 8) ? ((n + 4) / 8) * 4 : n / 2)

 #include "lapack.h"
 #include "blas.h"

 // sytrf helper routines
 void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
 void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
 void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);

 // trsyl helper routines
 void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
 void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
 void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
 void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);

 #endif /*  RELAPACK_INT_H */
--- a/relapack/src/sgbtrf.c
+++ b/relapack/src/sgbtrf.c
@@ -0,0 +1,227 @@
 #include "relapack.h"
 #include "stdlib.h"

 static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *,
    const int *, float *, const int *, int *, float *, const int *, float *,
    const int *, int *);


 /** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's sgbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html
 * */
 void RELAPACK_sgbtrf(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {

    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kl < 0)
        *info = -3;
    else if (*ku < 0)
        *info = -4;
    else if (*ldAb < 2 * *kl + *ku + 1)
        *info = -6;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("SGBTRF", &minfo);
        return;
    }

    // Constant
    const float ZERO[] = { 0. };

    // Result upper band width
    const int kv = *ku + *kl;

    // Unskewg A
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + kv;

    // Zero upper diagonal fill-in elements
    int i, j;
    for (j = 0; j < *n; j++) {
        float *const A_j = A + *ldA * j;
        for (i = MAX(0, j - kv); i < j - *ku; i++)
            A_j[i] = 0.;
    }

    // Allocate work space
    const int n1 = SREC_SPLIT(*n);
    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
    const int nWorkl = (kv > n1) ? n1 : kv;
    const int mWorku = (*kl > n1) ? n1 : *kl;
    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
    float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
    float *Worku = malloc(mWorku * nWorku * sizeof(float));
    LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);

    // Recursive kernel
    RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);

    // Free work space
    free(Workl);
    free(Worku);
 }


 /** sgbtrf's recursive compute kernel */
 static void RELAPACK_sgbtrf_rec(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
        // Unblocked
        LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };
    const int    iONE[] = { 1 };

    // Loop iterators
    int i, j;

    // Output upper band width
    const int kv = *ku + *kl;

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + kv;

    // Splitting
    const int n1  = MIN(SREC_SPLIT(*n), *kl);
    const int n2  = *n - n1;
    const int m1  = MIN(n1, *m);
    const int m2  = *m - m1;
    const int mn1 = MIN(m1, n1);
    const int mn2 = MIN(m2, n2);

    // Ab_L *
    //      Ab_BR
    float *const Ab_L  = Ab;
    float *const Ab_BR = Ab + *ldAb * n1;

    // A_L A_R
    float *const A_L = A;
    float *const A_R = A + *ldA * n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + *ldA * n1;
    float *const A_BL = A             + m1;
    float *const A_BR = A + *ldA * n1 + m1;

    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;

    // Banded splitting
    const int n21 = MIN(n2, kv - n1);
    const int n22 = MIN(n2 - n21, n1);
    const int m21 = MIN(m2, *kl - m1);
    const int m22 = MIN(m2 - m21, m1);

    //   n1 n21  n22
    // m *  A_Rl ARr
    float *const A_Rl = A_R;
    float *const A_Rr = A_R + *ldA * n21;

    //     n1    n21    n22
    // m1  *     A_TRl  A_TRr
    // m21 A_BLt A_BRtl A_BRtr
    // m22 A_BLb A_BRbl A_BRbr
    float *const A_TRl  = A_TR;
    float *const A_TRr  = A_TR + *ldA * n21;
    float *const A_BLt  = A_BL;
    float *const A_BLb  = A_BL              + m21;
    float *const A_BRtl = A_BR;
    float *const A_BRtr = A_BR + *ldA * n21;
    float *const A_BRbl = A_BR              + m21;
    float *const A_BRbr = A_BR + *ldA * n21 + m21;

    // recursion(Ab_L, ipiv_T)
    RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);

    // Workl = A_BLb
    LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);

    // partially redo swaps in A_L
    for (i = 0; i < mn1; i++) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
            else
                BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
        }
    }

    // apply pivots to A_Rl
    LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);

    // apply pivots to A_Rr columnwise
    for (j = 0; j < n22; j++) {
        float *const A_Rrj = A_Rr + *ldA * j;
        for (i = j; i < mn1; i++) {
            const int ip = ipiv_T[i] - 1;
            if (ip != i) {
                const float tmp = A_Rrj[i];
                A_Rrj[i] = A_Rr[ip];
                A_Rrj[ip] = tmp;
            }
        }
    }

    // A_TRl = A_TL \ A_TRl
    BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
    // Worku = A_TRr
    LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
    // Worku = A_TL \ Worku
    BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
    // A_TRr = Worku
    LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
    // A_BRtl = A_BRtl - A_BLt * A_TRl
    BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
    // A_BRbl = A_BRbl - Workl * A_TRl
    BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
    // A_BRtr = A_BRtr - A_BLt * Worku
    BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
    // A_BRbr = A_BRbr - Workl * Worku
    BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);

    // partially undo swaps in A_L
    for (i = mn1 - 1; i >= 0; i--) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
            else
                BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
        }
    }

    // recursion(Ab_BR, ipiv_B)
    RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
    if (*info)
        *info += n1;
    // shift pivots
    for (i = 0; i < mn2; i++)
        ipiv_B[i] += n1;
 }
--- a/relapack/src/sgemmt.c
+++ b/relapack/src/sgemmt.c
@@ -0,0 +1,165 @@
 #include "relapack.h"

 static void RELAPACK_sgemmt_rec(const char *, const char *, const char *,
    const int *, const int *, const float *, const float *, const int *,
    const float *, const int *, const float *, float *, const int *);

 static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *,
    const int *, const int *, const float *, const float *, const int *,
    const float *, const int *, const float *, float *, const int *);


 /** SGEMMT computes a matrix-matrix product with general matrices but updates
 * only the upper or lower triangular part of the result matrix.
 *
 * This routine performs the same operation as the BLAS routine
 * sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
 * but only updates the triangular part of C specified by uplo:
 * If (*uplo == 'L'), only the lower triangular part of C is updated,
 * otherwise the upper triangular part is updated.
 * */
 void RELAPACK_sgemmt(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {

 #if HAVE_XGEMMT
    BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
    return;
 #else

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    const int notransA = LAPACK(lsame)(transA, "N");
    const int tranA = LAPACK(lsame)(transA, "T");
    const int notransB = LAPACK(lsame)(transB, "N");
    const int tranB = LAPACK(lsame)(transB, "T");
    int info = 0;
    if (!lower && !upper)
        info = 1;
    else if (!tranA && !notransA)
        info = 2;
    else if (!tranB && !notransB)
        info = 3;
    else if (*n < 0)
        info = 4;
    else if (*k < 0)
        info = 5;
    else if (*ldA < MAX(1, notransA ? *n : *k))
        info = 8;
    else if (*ldB < MAX(1, notransB ? *k : *n))
        info = 10;
    else if (*ldC < MAX(1, *n))
        info = 13;
    if (info) {
        LAPACK(xerbla)("SGEMMT", &info);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';
    const char cleantransA = notransA ? 'N' : 'T';
    const char cleantransB = notransB ? 'N' : 'T';

    // Recursive kernel
    RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
 #endif
 }


 /** sgemmt's recursive compute kernel */
 static void RELAPACK_sgemmt_rec(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {

    if (*n <= MAX(CROSSOVER_SGEMMT, 1)) {
        // Unblocked
        RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
        return;
    }

    // Splitting
    const int n1 = SREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_T
    // A_B
    const float *const A_T = A;
    const float *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1);

    // B_L B_R
    const float *const B_L = B;
    const float *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1);

    // C_TL C_TR
    // C_BL C_BR
    float *const C_TL = C;
    float *const C_TR = C + *ldC * n1;
    float *const C_BL = C             + n1;
    float *const C_BR = C + *ldC * n1 + n1;

    // recursion(C_TL)
    RELAPACK_sgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);

    if (*uplo == 'L')
        // C_BL = alpha A_B B_L + beta C_BL
        BLAS(sgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
    else
        // C_TR = alpha A_T B_R + beta C_TR
        BLAS(sgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);

    // recursion(C_BR)
    RELAPACK_sgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
 }


 /** sgemmt's unblocked compute kernel */
 static void RELAPACK_sgemmt_rec2(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
 ) {

    const int incB = (*transB == 'N') ? 1 : *ldB;
    const int incC = 1;

    int i;
    for (i = 0; i < *n; i++) {
        // A_0
        // A_i
        const float *const A_0 = A;
        const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i);

        // * B_i *
        const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i);

        // * C_0i *
        // * C_ii *
        float *const C_0i = C + *ldC * i;
        float *const C_ii = C + *ldC * i + i;

        if (*uplo == 'L') {
            const int nmi = *n - i;
            if (*transA == 'N')
                BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
            else
                BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
        } else {
            const int ip1 = i + 1;
            if (*transA == 'N')
                BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
            else
                BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
        }
    }
 }
--- a/relapack/src/sgetrf.c
+++ b/relapack/src/sgetrf.c
@@ -0,0 +1,117 @@
 #include "relapack.h"

 static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *,
    int *, int *);


 /** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's sgetrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html
 * */
 void RELAPACK_sgetrf(
    const int *m, const int *n,
    float *A, const int *ldA, int *ipiv,
    int *info
 ) {

    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("SGETRF", &minfo);
        return;
    }

    const int sn = MIN(*m, *n);

    RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);

    // Right remainder
    if (*m < *n) {
        // Constants
        const float ONE[] = { 1. };
        const int  iONE[] = { 1. };

        // Splitting
        const int rn = *n - *m;

        // A_L A_R
        const float *const A_L = A;
        float *const       A_R = A + *ldA * *m;

        // A_R = apply(ipiv, A_R)
        LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
        // A_R = A_L \ A_R
        BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
    }
 }


 /** sgetrf's recursive compute kernel */
 static void RELAPACK_sgetrf_rec(
    const int *m, const int *n,
    float *A, const int *ldA, int *ipiv,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_SGETRF, 1)) {
        // Unblocked
        LAPACK(sgetf2)(m, n, A, ldA, ipiv, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };
    const int   iONE[] = { 1 };

    // Splitting
    const int n1 = SREC_SPLIT(*n);
    const int n2 = *n - n1;
    const int m2 = *m - n1;

    // A_L A_R
    float *const A_L = A;
    float *const A_R = A + *ldA * n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + *ldA * n1;
    float *const A_BL = A             + n1;
    float *const A_BR = A + *ldA * n1 + n1;

    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;

    // recursion(A_L, ipiv_T)
    RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
    // apply pivots to A_R
    LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);

    // A_TR = A_TL \ A_TR
    BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
    // A_BR = A_BR - A_BL * A_TR
    BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);

    // recursion(A_BR, ipiv_B)
    RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
    if (*info)
        *info += n1;
    // apply pivots to A_BL
    LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
    // shift pivots
    int i;
    for (i = 0; i < n2; i++)
        ipiv_B[i] += n1;
 }
--- a/relapack/src/slauum.c
+++ b/relapack/src/slauum.c
@@ -0,0 +1,87 @@
 #include "relapack.h"

 static void RELAPACK_slauum_rec(const char *, const int *, float *,
    const int *, int *);


 /** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
 *
 * This routine is functionally equivalent to LAPACK's slauum.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html
 * */
 void RELAPACK_slauum(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("SLAUUM", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Recursive kernel
    RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info);
 }


 /** slauum's recursive compute kernel */
 static void RELAPACK_slauum_rec(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_SLAUUM, 1)) {
        // Unblocked
        LAPACK(slauu2)(uplo, n, A, ldA, info);
        return;
    }

    // Constants
    const float ONE[] = { 1. };

    // Splitting
    const int n1 = SREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + *ldA * n1;
    float *const A_BL = A             + n1;
    float *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL)
    RELAPACK_slauum_rec(uplo, &n1, A_TL, ldA, info);

    if (*uplo == 'L') {
        // A_TL = A_TL + A_BL' * A_BL
        BLAS(ssyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
        // A_BL = A_BR' * A_BL
        BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
    } else {
        // A_TL = A_TL + A_TR * A_TR'
        BLAS(ssyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
        // A_TR = A_TR * A_BR'
        BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_slauum_rec(uplo, &n2, A_BR, ldA, info);
 }
--- a/relapack/src/spbtrf.c
+++ b/relapack/src/spbtrf.c
@@ -0,0 +1,157 @@
 #include "relapack.h"
 #include "stdlib.h"

 static void RELAPACK_spbtrf_rec(const char *, const int *, const int *,
    float *, const int *, float *, const int *, int *);


 /** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
 *
 * This routine is functionally equivalent to LAPACK's spbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html
 * */
 void RELAPACK_spbtrf(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kd < 0)
        *info = -3;
    else if (*ldAb < *kd + 1)
        *info = -5;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("SPBTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Constant
    const float ZERO[] = { 0. };

    // Allocate work space
    const int n1 = SREC_SPLIT(*n);
    const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
    const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
    float *Work = malloc(mWork * nWork * sizeof(float));
    LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);

    // Recursive kernel
    RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);

    // Free work space
    free(Work);
 }


 /** spbtrf's recursive compute kernel */
 static void RELAPACK_spbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    float *Work, const int *ldWork,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_SPBTRF, 1)) {
        // Unblocked
        LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + ((*uplo == 'L') ? 0 : *kd);

    // Splitting
    const int n1 = MIN(SREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;

    // * *
    // * Ab_BR
    float *const Ab_BR = Ab + *ldAb * n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + *ldA * n1;
    float *const A_BL = A             + n1;
    float *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL)
    RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    // Banded splitting
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, *kd);

    //     n1    n21    n22
    // n1  *     A_TRl  A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    float *const A_TRl  = A_TR;
    float *const A_TRr  = A_TR + *ldA * n21;
    float *const A_BLt  = A_BL;
    float *const A_BLb  = A_BL               + n21;
    float *const A_BRtl = A_BR;
    float *const A_BRtr = A_BR + *ldA * n21;
    float *const A_BRbl = A_BR               + n21;
    float *const A_BRbr = A_BR + *ldA * n21  + n21;

    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb
        LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work
        LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr
        LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work
        LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }

    // recursion(A_BR)
    if (*kd > n1)
        RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/spotrf.c
+++ b/relapack/src/spotrf.c
@@ -0,0 +1,92 @@
 #include "relapack.h"

 static void RELAPACK_spotrf_rec(const char *, const int *, float *, 
        const int *, int *);


 /** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
 *
 * This routine is functionally equivalent to LAPACK's spotrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html
 * */
 void RELAPACK_spotrf(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("SPOTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Recursive kernel
    RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info);
 }


 /** spotrf's recursive compute kernel */
 static void RELAPACK_spotrf_rec(
    const char *uplo, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_SPOTRF, 1)) {
        // Unblocked
        LAPACK(spotf2)(uplo, n, A, ldA, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };

    // Splitting
    const int n1 = SREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + *ldA * n1;
    float *const A_BL = A             + n1;
    float *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL)
    RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    if (*uplo == 'L') {
        // A_BL = A_BL / A_TL'
        BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
        // A_BR = A_BR - A_BL * A_BL'
        BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
    } else {
        // A_TR = A_TL' \ A_TR
        BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
        // A_BR = A_BR - A_TR' * A_TR
        BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/ssygst.c
+++ b/relapack/src/ssygst.c
@@ -0,0 +1,212 @@
 #include "relapack.h"
 #if XSYGST_ALLOW_MALLOC
 #include "stdlib.h"
 #endif

 static void RELAPACK_ssygst_rec(const int *, const char *, const int *,
    float *, const int *, const float *, const int *,
    float *, const int *, int *);


 /** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
 *
 * This routine is functionally equivalent to LAPACK's ssygst.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html
 * */
 void RELAPACK_ssygst(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (*itype < 1 || *itype > 3)
        *info = -1;
    else if (!lower && !upper)
        *info = -2;
    else if (*n < 0)
        *info = -3;
    else if (*ldA < MAX(1, *n))
        *info = -5;
    else if (*ldB < MAX(1, *n))
        *info = -7;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("SSYGST", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Allocate work space
    float *Work = NULL;
    int   lWork = 0;
 #if XSYGST_ALLOW_MALLOC
    const int n1 = SREC_SPLIT(*n);
    lWork = n1 * (*n - n1);
    Work  = malloc(lWork * sizeof(float));
    if (!Work)
        lWork = 0;
 #endif

    // Recursive kernel
    RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);

    // Free work space
 #if XSYGST_ALLOW_MALLOC
    if (Work)
        free(Work);
 #endif
 }


 /** ssygst's recursive compute kernel */
 static void RELAPACK_ssygst_rec(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    float *Work, const int *lWork, int *info
 ) {

    if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
        // Unblocked
        LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info);
        return;
    }

    // Constants
    const float ZERO[]  = { 0. };
    const float ONE[]   = { 1. };
    const float MONE[]  = { -1. };
    const float HALF[]  = { .5 };
    const float MHALF[] = { -.5 };
    const int   iONE[]  = { 1 };

    // Loop iterator
    int i;

    // Splitting
    const int n1 = SREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + *ldA * n1;
    float *const A_BL = A             + n1;
    float *const A_BR = A + *ldA * n1 + n1;

    // B_TL B_TR
    // B_BL B_BR
    const float *const B_TL = B;
    const float *const B_TR = B + *ldB * n1;
    const float *const B_BL = B             + n1;
    const float *const B_BR = B + *ldB * n1 + n1;

    // recursion(A_TL, B_TL)
    RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);

    if (*itype == 1)
        if (*uplo == 'L') {
            // A_BL = A_BL / B_TL'
            BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * B_BL * A_TL
                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
            } else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
            BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
            else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR \ A_BL
            BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // A_TR = B_TL' \ A_TR
            BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * A_TL * B_TR
                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_BL + T
                for (i = 0; i < n2; i++)
                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
            } else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
            BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_BL + T
                for (i = 0; i < n2; i++)
                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
            else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR / B_BR
            BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }
    else
        if (*uplo == 'L') {
            // A_BL = A_BL * B_TL
            BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * A_BR * B_BL
                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
            } else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
            BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
            else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR * A_BL
            BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // A_TR = B_TL * A_TR
            BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * B_TR * A_BR
                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
            } else
                // A_TR = A_TR + 1/2 B_TR A_BR
                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
            BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
            else
                // A_TR = A_TR + 1/2 B_TR * A_BR
                BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR * B_BR
            BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }

    // recursion(A_BR, B_BR)
    RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
 }
--- a/relapack/src/ssytrf.c
+++ b/relapack/src/ssytrf.c
@@ -0,0 +1,238 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *,
    float *, const int *, int *, float *, const int *, int *);


 /** SSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's ssytrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html
 * */
 void RELAPACK_ssytrf(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    float *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * sizeof(float));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("SSYTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy arguments
    int nout;

    // Recursive kernel
    RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** ssytrf's recursive compute kernel */
 static void RELAPACK_ssytrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_SSYTRF, 3)) {
        // Unblocked
        if (top) {
            LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };
    const int   iONE[] = { 1 };

    // Loop iterator
    int i;

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = SREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL   = A             + n1;
        float *const A_BR   = A + *ldA * n1 + n1;
        float *const A_BL_B = A             + *n;
        float *const A_BR_B = A + *ldA * n1 + *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL =              Work                + n1;
        float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info  = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = SREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1  = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + *ldA * n_rest;
        float *const A_TR_T = A + *ldA * (n_rest + n1);
        float *const A_TL   = A + *ldA * n_rest        + n_rest;
        float *const A_TR   = A + *ldA * (n_rest + n1) + n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L  = Work;
        float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/ssytrf_rec2.c
+++ b/relapack/src/ssytrf_rec2.c
@@ -0,0 +1,351 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static int c__1 = 1;
 static float c_b8 = -1.f;
 static float c_b9 = 1.f;

 /** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method.
 *
 * This routine is a minor modification of LAPACK's slasyf.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int *
 	nb, int *kb, float *a, int *lda, int *ipiv, float *w,
 	int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
    float r__1, r__2, r__3;

    /* Builtin functions */
    double sqrt(double);

    /* Local variables */
    static int j, k;
    static float t, r1, d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int sscal_(int *, float *, float *, int *),
 	    sgemv_(char *, int *, int *, float *, float *, int *,
 	    float *, int *, float *, float *, int *, ftnlen);
    static int kstep;
    extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
 	    int *), sswap_(int *, float *, int *, float *, int *
 	    );
    static float absakk;
    extern int isamax_(int *, float *, int *);
    static float colmax, rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	if (k < *n) {
 	    i__1 = *n - k;
 	    sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	}
 	kstep = 1;
 	absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = k - imax;
 		scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		}
 		i__1 = k - imax;
 		jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
 			 &c__1);
 		rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1));
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 /* Computing MAX */
 		    r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1],
 			     dabs(r__1));
 		    rowmax = dmax(r__2,r__3);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >=
 			 alpha * rowmax) {
 		    kp = imax;
 		    scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		} else {
 		    kp = imax;
 		    kstep = 2;
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kp != kk) {
 		a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
 		i__1 = kk - 1 - kp;
 		scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		if (kp > 1) {
 		    i__1 = kp - 1;
 		    scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
 			    + 1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
 			    + 1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		r1 = 1.f / a[k + k * a_dim1];
 		i__1 = k - 1;
 		sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 	    } else {
 		if (k > 2) {
 		    d21 = w[k - 1 + kw * w_dim1];
 		    d11 = w[k + kw * w_dim1] / d21;
 		    d22 = w[k - 1 + (kw - 1) * w_dim1] / d21;
 		    t = 1.f / (d11 * d22 - 1.f);
 		    d21 = t / d21;
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1)
 				* w_dim1] - w[j + kw * w_dim1]);
 			a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] -
 				w[j + (kw - 1) * w_dim1]);
 /* L20: */
 		    }
 		}
 		a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
 		a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
 		a[k + k * a_dim1] = w[k + kw * w_dim1];
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    ++j;
 	}
 	++j;
 	if (jp != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
 	}
 	if (j < *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	i__1 = *n - k + 1;
 	scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
 	i__1 = *n - k + 1;
 	i__2 = k - 1;
 	sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k
 		+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12);
 	kstep = 1;
 	absakk = (r__1 = w[k + k * w_dim1], dabs(r__1));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		i__1 = imax - k;
 		scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = *n - imax + 1;
 		scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
 			1) * w_dim1], &c__1);
 		i__1 = *n - k + 1;
 		i__2 = k - 1;
 		sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1],
 			lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) *
 			w_dim1], &c__1, (ftnlen)12);
 		i__1 = imax - k;
 		jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
 			;
 		rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1));
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 /* Computing MAX */
 		    r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1],
 			    dabs(r__1));
 		    rowmax = dmax(r__2,r__3);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >=
 			alpha * rowmax) {
 		    kp = imax;
 		    i__1 = *n - k + 1;
 		    scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		} else {
 		    kp = imax;
 		    kstep = 2;
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kp != kk) {
 		a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
 		i__1 = kp - kk - 1;
 		scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
 			1) * a_dim1], lda);
 		if (kp < *n) {
 		    i__1 = *n - kp;
 		    scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
 			    + kp * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		}
 		sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    r1 = 1.f / a[k + k * a_dim1];
 		    i__1 = *n - k;
 		    sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		}
 	    } else {
 		if (k < *n - 1) {
 		    d21 = w[k + 1 + k * w_dim1];
 		    d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
 		    d22 = w[k + k * w_dim1] / d21;
 		    t = 1.f / (d11 * d22 - 1.f);
 		    d21 = t / d21;
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] -
 				w[j + (k + 1) * w_dim1]);
 			a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) *
 				 w_dim1] - w[j + k * w_dim1]);
 /* L80: */
 		    }
 		}
 		a[k + k * a_dim1] = w[k + k * w_dim1];
 		a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
 		a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    --j;
 	}
 	--j;
 	if (jp != jj && j >= 1) {
 	    sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j > 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/ssytrf_rook.c
+++ b/relapack/src/ssytrf_rook.c
@@ -0,0 +1,236 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *,
    float *, const int *, int *, float *, const int *, int *);


 /** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's ssytrf_rook.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html
 * */
 void RELAPACK_ssytrf_rook(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    float *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * sizeof(float));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("SSYTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
    int nout;

    // Recursive kernel
    RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** ssytrf_rook's recursive compute kernel */
 static void RELAPACK_ssytrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) {
        // Unblocked
        if (top) {
            LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };
    const int   iONE[]  = { 1 };

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = SREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2   = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL   = A             + n1;
        float *const A_BR   = A + *ldA * n1 + n1;
        float *const A_BL_B = A             + *n;
        float *const A_BR_B = A + *ldA * n1 + *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL =              Work                + n1;
        float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info  = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = SREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + *ldA * n_rest;
        float *const A_TR_T = A + *ldA * (n_rest + n1);
        float *const A_TL   = A + *ldA * n_rest        + n_rest;
        float *const A_TR   = A + *ldA * (n_rest + n1) + n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L  = Work;
        float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/ssytrf_rook_rec2.c
+++ b/relapack/src/ssytrf_rook_rec2.c
@@ -0,0 +1,451 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static int c__1 = 1;
 static float c_b9 = -1.f;
 static float c_b10 = 1.f;

 /** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method.
 *
 * This routine is a minor modification of LAPACK's slasyf_rook.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n,
 	int *nb, int *kb, float *a, int *lda, int *ipiv, float *
 	w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
    float r__1;

    /* Builtin functions */
    double sqrt(double);

    /* Local variables */
    static int j, k, p;
    static float t, r1, d11, d12, d21, d22;
    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
    static logical done;
    static int imax, jmax;
    static float alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int sscal_(int *, float *, float *, int *);
    static float sfmin;
    static int itemp;
    extern /* Subroutine */ int sgemv_(char *, int *, int *, float *,
 	    float *, int *, float *, int *, float *, float *, int *,
 	    ftnlen);
    static int kstep;
    static float stemp;
    extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
 	    int *), sswap_(int *, float *, int *, float *, int *
 	    );
    static float absakk;
    extern double slamch_(char *, ftnlen);
    extern int isamax_(int *, float *, int *);
    static float colmax, rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    sfmin = slamch_("S", (ftnlen)1);
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	kstep = 1;
 	p = k;
 	scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	if (k < *n) {
 	    i__1 = *n - k;
 	    sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	}
 	absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L12:
 		scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = k - imax;
 		scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		}
 		if (imax != k) {
 		    i__1 = k - imax;
 		    jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) *
 			    w_dim1], &c__1);
 		    rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1));
 		} else {
 		    rowmax = 0.f;
 		}
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 		    stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1));
 		    if (stemp > rowmax) {
 			rowmax = stemp;
 			jmax = itemp;
 		    }
 		}
 		if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) <
 			alpha * rowmax)) {
 		    kp = imax;
 		    scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		}
 		if (! done) {
 		    goto L12;
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kstep == 2 && p != k) {
 		i__1 = k - p;
 		scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
 			a_dim1], lda);
 		scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
 			c__1);
 		i__1 = *n - k + 1;
 		sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
 			lda);
 		i__1 = *n - kk + 1;
 		sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
 			 ldw);
 	    }
 	    if (kp != kk) {
 		a[kp + k * a_dim1] = a[kk + k * a_dim1];
 		i__1 = k - 1 - kp;
 		scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
 			c__1);
 		i__1 = *n - kk + 1;
 		sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
 			 lda);
 		i__1 = *n - kk + 1;
 		sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		if (k > 1) {
 		    if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) {
 			r1 = 1.f / a[k + k * a_dim1];
 			i__1 = k - 1;
 			sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 		    } else if (a[k + k * a_dim1] != 0.f) {
 			i__1 = k - 1;
 			for (ii = 1; ii <= i__1; ++ii) {
 			    a[ii + k * a_dim1] /= a[k + k * a_dim1];
 /* L14: */
 			}
 		    }
 		}
 	    } else {
 		if (k > 2) {
 		    d12 = w[k - 1 + kw * w_dim1];
 		    d11 = w[k + kw * w_dim1] / d12;
 		    d22 = w[k - 1 + (kw - 1) * w_dim1] / d12;
 		    t = 1.f / (d11 * d22 - 1.f);
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) *
 				 w_dim1] - w[j + kw * w_dim1]) / d12);
 			a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] -
 				w[j + (kw - 1) * w_dim1]) / d12);
 /* L20: */
 		    }
 		}
 		a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
 		a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
 		a[k + k * a_dim1] = w[k + kw * w_dim1];
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    ++j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	++j;
 	if (jp2 != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	jj = j - 1;
 	if (jp1 != jj && kstep == 2) {
 	    i__1 = *n - j + 1;
 	    sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	if (j <= *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	kstep = 1;
 	p = k;
 	i__1 = *n - k + 1;
 	scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
 	if (k > 1) {
 	    i__1 = *n - k + 1;
 	    i__2 = k - 1;
 	    sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, &
 		    w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, (
 		    ftnlen)12);
 	}
 	absakk = (r__1 = w[k + k * w_dim1], dabs(r__1));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1));
 	} else {
 	    colmax = 0.f;
 	}
 	if (dmax(absakk,colmax) == 0.f) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = *n - k + 1;
 	    scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 		    c__1);
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L72:
 		i__1 = imax - k;
 		scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = *n - imax + 1;
 		scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
 			1) * w_dim1], &c__1);
 		if (k > 1) {
 		    i__1 = *n - k + 1;
 		    i__2 = k - 1;
 		    sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1]
 			    , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k
 			    + 1) * w_dim1], &c__1, (ftnlen)12);
 		}
 		if (imax != k) {
 		    i__1 = imax - k;
 		    jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &
 			    c__1);
 		    rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1));
 		} else {
 		    rowmax = 0.f;
 		}
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 		    stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1));
 		    if (stemp > rowmax) {
 			rowmax = stemp;
 			jmax = itemp;
 		    }
 		}
 		if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) <
 			alpha * rowmax)) {
 		    kp = imax;
 		    i__1 = *n - k + 1;
 		    scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    i__1 = *n - k + 1;
 		    scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		}
 		if (! done) {
 		    goto L72;
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kstep == 2 && p != k) {
 		i__1 = p - k;
 		scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
 			lda);
 		i__1 = *n - p + 1;
 		scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
 			c__1);
 		sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
 		sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
 	    }
 	    if (kp != kk) {
 		a[kp + k * a_dim1] = a[kk + k * a_dim1];
 		i__1 = kp - k - 1;
 		scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
 			* a_dim1], lda);
 		i__1 = *n - kp + 1;
 		scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
 			a_dim1], &c__1);
 		sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) {
 			r1 = 1.f / a[k + k * a_dim1];
 			i__1 = *n - k;
 			sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		    } else if (a[k + k * a_dim1] != 0.f) {
 			i__1 = *n;
 			for (ii = k + 1; ii <= i__1; ++ii) {
 			    a[ii + k * a_dim1] /= a[k + k * a_dim1];
 /* L74: */
 			}
 		    }
 		}
 	    } else {
 		if (k < *n - 1) {
 		    d21 = w[k + 1 + k * w_dim1];
 		    d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
 		    d22 = w[k + k * w_dim1] / d21;
 		    t = 1.f / (d11 * d22 - 1.f);
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
 				j + (k + 1) * w_dim1]) / d21);
 			a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
 				w_dim1] - w[j + k * w_dim1]) / d21);
 /* L80: */
 		    }
 		}
 		a[k + k * a_dim1] = w[k + k * w_dim1];
 		a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
 		a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    --j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	--j;
 	if (jp2 != jj && j >= 1) {
 	    sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	jj = j + 1;
 	if (jp1 != jj && kstep == 2) {
 	    sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j >= 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/stgsyl.c
+++ b/relapack/src/stgsyl.c
@@ -0,0 +1,274 @@
 #include "relapack.h"
 #include <math.h>

 static void RELAPACK_stgsyl_rec(const char *, const int *, const int *,
    const int *, const float *, const int *, const float *, const int *,
    float *, const int *, const float *, const int *, const float *,
    const int *, float *, const int *, float *, float *, float *, int *, int *,
    int *);


 /** STGSYL solves the generalized Sylvester equation.
 *
 * This routine is functionally equivalent to LAPACK's stgsyl.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html
 * */
 void RELAPACK_stgsyl(
    const char *trans, const int *ijob, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dif,
    float *Work, const int *lWork, int *iWork, int *info
 ) {

    // Parse arguments
    const int notran = LAPACK(lsame)(trans, "N");
    const int tran = LAPACK(lsame)(trans, "T");

    // Compute work buffer size
    int lwmin = 1;
    if (notran && (*ijob == 1 || *ijob == 2))
        lwmin = MAX(1, 2 * *m * *n);
    *info = 0;

    // Check arguments
    if (!tran && !notran)
        *info = -1;
    else if (notran && (*ijob < 0 || *ijob > 4))
        *info = -2;
    else if (*m <= 0)
        *info = -3;
    else if (*n <= 0)
        *info = -4;
    else if (*ldA < MAX(1, *m))
        *info = -6;
    else if (*ldB < MAX(1, *n))
        *info = -8;
    else if (*ldC < MAX(1, *m))
        *info = -10;
    else if (*ldD < MAX(1, *m))
        *info = -12;
    else if (*ldE < MAX(1, *n))
        *info = -14;
    else if (*ldF < MAX(1, *m))
        *info = -16;
    else if (*lWork < lwmin && *lWork != -1)
        *info = -20;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("STGSYL", &minfo);
        return;
    }

    if (*lWork == -1) {
        // Work size query
        *Work = lwmin;
        return;
    }

    // Clean char * arguments
    const char cleantrans = notran ? 'N' : 'T';

    // Constant
    const float ZERO[] = { 0. };

    int isolve = 1;
    int ifunc  = 0;
    if (notran) {
        if (*ijob >= 3) {
            ifunc = *ijob - 2;
            LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
            LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
        } else if (*ijob >= 1)
            isolve = 2;
    }

    float scale2;
    int iround;
    for (iround = 1; iround <= isolve; iround++) {
        *scale = 1;
        float dscale = 0;
        float dsum   = 1;
        int pq;
        RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
        if (dscale != 0) {
            if (*ijob == 1 || *ijob == 3)
                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
            else
                *dif = sqrt(pq) / (dscale * sqrt(dsum));
        }
        if (isolve == 2) {
            if (iround == 1) {
                if (notran)
                    ifunc = *ijob;
                scale2 = *scale;
                LAPACK(slacpy)("F", m, n, C, ldC, Work, m);
                LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m);
                LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
                LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
            } else {
                LAPACK(slacpy)("F", m, n, Work, m, C, ldC);
                LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF);
                *scale = scale2;
            }
        }
    }
 }


 /** stgsyl's recursive vompute kernel */
 static void RELAPACK_stgsyl_rec(
    const char *trans, const int *ifunc, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dsum, float *dscale,
    int *iWork, int *pq, int *info
 ) {

    if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) {
        // Unblocked
        LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };
    const int   iONE[] = { 1 };

    // Outputs
    float scale1[] = { 1. };
    float scale2[] = { 1. };
    int   info1[]  = { 0 };
    int   info2[]  = { 0 };

    if (*m > *n) {
        // Splitting
        int m1 = SREC_SPLIT(*m);
        if (A[m1 + *ldA * (m1 - 1)])
            m1++;
        const int m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
        const float *const A_TL = A;
        const float *const A_TR = A + *ldA * m1;
        const float *const A_BR = A + *ldA * m1 + m1;

        // C_T
        // C_B
        float *const C_T = C;
        float *const C_B = C + m1;

        // D_TL D_TR
        // 0    D_BR
        const float *const D_TL = D;
        const float *const D_TR = D + *ldD * m1;
        const float *const D_BR = D + *ldD * m1 + m1;

        // F_T
        // F_B
        float *const F_T = F;
        float *const F_B = F + m1;

        if (*trans == 'N') {
            // recursion(A_BR, B, C_B, D_BR, E, F_B)
            RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
            // C_T = C_T - A_TR * C_B
            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
            // F_T = F_T - D_TR * C_B
            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
            // recursion(A_TL, B, C_T, D_TL, E, F_T)
            RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
            }
        } else {
            // recursion(A_TL, B, C_T, D_TL, E, F_T)
            RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
            // apply scale
            if (scale1[0] != 1)
                LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
            // C_B = C_B - A_TR^H * C_T
            BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
            // C_B = C_B - D_TR^H * F_T
            BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
            // recursion(A_BR, B, C_B, D_BR, E, F_B)
            RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
            }
        }
    } else {
        // Splitting
        int n1 = SREC_SPLIT(*n);
        if (B[n1 + *ldB * (n1 - 1)])
            n1++;
        const int n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
        const float *const B_TL = B;
        const float *const B_TR = B + *ldB * n1;
        const float *const B_BR = B + *ldB * n1 + n1;

        // C_L C_R
        float *const C_L = C;
        float *const C_R = C + *ldC * n1;

        // E_TL E_TR
        // 0    E_BR
        const float *const E_TL = E;
        const float *const E_TR = E + *ldE * n1;
        const float *const E_BR = E + *ldE * n1 + n1;

        // F_L F_R
        float *const F_L = F;
        float *const F_R = F + *ldF * n1;

        if (*trans == 'N') {
            // recursion(A, B_TL, C_L, D, E_TL, F_L)
            RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
            // C_R = C_R + F_L * B_TR
            BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
            // F_R = F_R + F_L * E_TR
            BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
            // recursion(A, B_BR, C_R, D, E_BR, F_R)
            RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
            }
        } else {
            // recursion(A, B_BR, C_R, D, E_BR, F_R)
            RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
            // apply scale
            if (scale1[0] != 1)
                LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
            // F_L = F_L + C_R * B_TR
            BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
            // F_L = F_L + F_R * E_TR
            BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
            // recursion(A, B_TL, C_L, D, E_TL, F_L)
            RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
            }
        }
    }

    *scale = scale1[0] * scale2[0];
    *info  = info1[0] || info2[0];
 }
--- a/relapack/src/strsyl.c
+++ b/relapack/src/strsyl.c
@@ -0,0 +1,169 @@
 #include "relapack.h"

 static void RELAPACK_strsyl_rec(const char *, const char *, const int *,
    const int *, const int *, const float *, const int *, const float *,
    const int *, float *, const int *, float *, int *);


 /** STRSYL solves the real Sylvester matrix equation.
 *
 * This routine is functionally equivalent to LAPACK's strsyl.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html
 * */
 void RELAPACK_strsyl(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC, float *scale,
    int *info
 ) {

    // Check arguments
    const int notransA = LAPACK(lsame)(tranA, "N");
    const int transA = LAPACK(lsame)(tranA, "T");
    const int ctransA = LAPACK(lsame)(tranA, "C");
    const int notransB = LAPACK(lsame)(tranB, "N");
    const int transB = LAPACK(lsame)(tranB, "T");
    const int ctransB = LAPACK(lsame)(tranB, "C");
    *info = 0;
    if (!transA && !ctransA && !notransA)
        *info = -1;
    else if (!transB && !ctransB && !notransB)
        *info = -2;
    else if (*isgn != 1 && *isgn != -1)
        *info = -3;
    else if (*m < 0)
        *info = -4;
    else if (*n < 0)
        *info = -5;
    else if (*ldA < MAX(1, *m))
        *info = -7;
    else if (*ldB < MAX(1, *n))
        *info = -9;
    else if (*ldC < MAX(1, *m))
        *info = -11;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("STRSYL", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
    const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');

    // Recursive kernel
    RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }


 /** strsyl's recursive compute kernel */
 static void RELAPACK_strsyl_rec(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC, float *scale,
    int *info
 ) {

    if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) {
        // Unblocked
        RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };
    const float MSGN[] = { -*isgn };
    const int   iONE[] = { 1 };

    // Outputs
    float scale1[] = { 1. };
    float scale2[] = { 1. };
    int   info1[]  = { 0 };
    int   info2[]  = { 0 };

    if (*m > *n) {
        // Splitting
        int m1 = SREC_SPLIT(*m);
        if (A[m1 + *ldA * (m1 - 1)])
            m1++;
        const int m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
        const float *const A_TL = A;
        const float *const A_TR = A + *ldA * m1;
        const float *const A_BR = A + *ldA * m1 + m1;

        // C_T
        // C_B
        float *const C_T = C;
        float *const C_B = C + m1;

        if (*tranA == 'N') {
            // recusion(A_BR, B, C_B)
            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
            // C_T = C_T - A_TR * C_B
            BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
            // recusion(A_TL, B, C_T)
            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
        } else {
            // recusion(A_TL, B, C_T)
            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
            // C_B = C_B - A_TR' * C_T
            BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
            // recusion(A_BR, B, C_B)
            RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
        }
    } else {
        // Splitting
        int n1 = SREC_SPLIT(*n);
        if (B[n1 + *ldB * (n1 - 1)])
            n1++;
        const int n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
        const float *const B_TL = B;
        const float *const B_TR = B + *ldB * n1;
        const float *const B_BR = B + *ldB * n1 + n1;

        // C_L C_R
        float *const C_L = C;
        float *const C_R = C + *ldC * n1;

        if (*tranB == 'N') {
            // recusion(A, B_TL, C_L)
            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
            // C_R = C_R -/+ C_L * B_TR
            BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
            // recusion(A, B_BR, C_R)
            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
        } else {
            // recusion(A, B_BR, C_R)
            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
            // C_L = C_L -/+ C_R * B_TR'
            BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
            // recusion(A, B_TL, C_L)
            RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
        }
    }

    *scale = scale1[0] * scale2[0];
    *info  = info1[0] || info2[0];
 }
--- a/relapack/src/strsyl_rec2.c
+++ b/relapack/src/strsyl_rec2.c
--- a/relapack/src/strtri.c
+++ b/relapack/src/strtri.c
@@ -0,0 +1,107 @@
 #include "relapack.h"

 static void RELAPACK_strtri_rec(const char *, const char *, const int *,
    float *, const int *, int *);


 /** CTRTRI computes the inverse of a real upper or lower triangular matrix A.
 *
 * This routine is functionally equivalent to LAPACK's strtri.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html
 * */
 void RELAPACK_strtri(
    const char *uplo, const char *diag, const int *n,
    float *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    const int nounit = LAPACK(lsame)(diag, "N");
    const int unit = LAPACK(lsame)(diag, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (!nounit && !unit)
        *info = -2;
    else if (*n < 0)
        *info = -3;
    else if (*ldA < MAX(1, *n))
        *info = -5;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("STRTRI", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower  ? 'L' : 'U';
    const char cleandiag = nounit ? 'N' : 'U';

    // check for singularity
    if (nounit) {
        int i;
        for (i = 0; i < *n; i++)
            if (A[i + *ldA * i] == 0) {
                *info = i;
                return;
            }
    }

    // Recursive kernel
    RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
 }


 /** strtri's recursive compute kernel */
 static void RELAPACK_strtri_rec(
    const char *uplo, const char *diag, const int *n,
    float *A, const int *ldA,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_STRTRI, 1)) {
        // Unblocked
        LAPACK(strti2)(uplo, diag, n, A, ldA, info);
        return;
    }

    // Constants
    const float ONE[]  = { 1. };
    const float MONE[] = { -1. };

    // Splitting
    const int n1 = SREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + *ldA * n1;
    float *const A_BL = A             + n1;
    float *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL)
    RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info);
    if (*info)
        return;

    if (*uplo == 'L') {
        // A_BL = - A_BL * A_TL
        BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
        // A_BL = A_BR \ A_BL
        BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
    } else {
        // A_TR = - A_TL * A_TR
        BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
        // A_TR = A_TR / A_BR
        BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/zgbtrf.c
+++ b/relapack/src/zgbtrf.c
@@ -0,0 +1,230 @@
 #include "relapack.h"
 #include "stdlib.h"

 static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *,
    const int *, double *, const int *, int *, double *, const int *, double *,
    const int *, int *);


 /** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's zgbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html
 * */
 void RELAPACK_zgbtrf(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    int *info
 ) {

    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kl < 0)
        *info = -3;
    else if (*ku < 0)
        *info = -4;
    else if (*ldAb < 2 * *kl + *ku + 1)
        *info = -6;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZGBTRF", &minfo);
        return;
    }

    // Constant
    const double ZERO[] = { 0., 0. };

    // Result upper band width
    const int kv = *ku + *kl;

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + 2 * kv;

    // Zero upper diagonal fill-in elements
    int i, j;
    for (j = 0; j < *n; j++) {
        double *const A_j = A + 2 * *ldA * j;
        for (i = MAX(0, j - kv); i < j - *ku; i++)
            A_j[2 * i] = A_j[2 * i + 1] = 0.;
    }

    // Allocate work space
    const int n1 = ZREC_SPLIT(*n);
    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
    const int nWorkl = (kv > n1) ? n1 : kv;
    const int mWorku = (*kl > n1) ? n1 : *kl;
    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
    double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double));
    double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double));
    LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);

    // Recursive kernel
    RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);

    // Free work space
    free(Workl);
    free(Worku);
 }


 /** zgbtrf's recursive compute kernel */
 static void RELAPACK_zgbtrf_rec(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) {
        // Unblocked
        LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };
    const int    iONE[] = { 1 };

    // Loop iterators
    int i, j;

    // Output upper band width
    const int kv = *ku + *kl;

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + 2 * kv;

    // Splitting
    const int n1  = MIN(ZREC_SPLIT(*n), *kl);
    const int n2  = *n - n1;
    const int m1  = MIN(n1, *m);
    const int m2  = *m - m1;
    const int mn1 = MIN(m1, n1);
    const int mn2 = MIN(m2, n2);

    // Ab_L *
    //      Ab_BR
    double *const Ab_L  = Ab;
    double *const Ab_BR = Ab + 2 * *ldAb * n1;

    // A_L A_R
    double *const A_L = A;
    double *const A_R = A + 2 * *ldA * n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + 2 * *ldA * n1;
    double *const A_BL = A                 + 2 * m1;
    double *const A_BR = A + 2 * *ldA * n1 + 2 * m1;

    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;

    // Banded splitting
    const int n21 = MIN(n2, kv - n1);
    const int n22 = MIN(n2 - n21, n1);
    const int m21 = MIN(m2, *kl - m1);
    const int m22 = MIN(m2 - m21, m1);

    //   n1 n21  n22
    // m *  A_Rl ARr
    double *const A_Rl = A_R;
    double *const A_Rr = A_R + 2 * *ldA * n21;

    //     n1    n21    n22
    // m1  *     A_TRl  A_TRr
    // m21 A_BLt A_BRtl A_BRtr
    // m22 A_BLb A_BRbl A_BRbr
    double *const A_TRl  = A_TR;
    double *const A_TRr  = A_TR + 2 * *ldA * n21;
    double *const A_BLt  = A_BL;
    double *const A_BLb  = A_BL                  + 2 * m21;
    double *const A_BRtl = A_BR;
    double *const A_BRtr = A_BR + 2 * *ldA * n21;
    double *const A_BRbl = A_BR                  + 2 * m21;
    double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;

    // recursion(Ab_L, ipiv_T)
    RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);

    // Workl = A_BLb
    LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);

    // partially redo swaps in A_L
    for (i = 0; i < mn1; i++) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
            else
                BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
        }
    }

    // apply pivots to A_Rl
    LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);

    // apply pivots to A_Rr columnwise
    for (j = 0; j < n22; j++) {
        double *const A_Rrj = A_Rr + 2 * *ldA * j;
        for (i = j; i < mn1; i++) {
            const int ip = ipiv_T[i] - 1;
            if (ip != i) {
                const double tmpr = A_Rrj[2 * i];
                const double tmpc = A_Rrj[2 * i + 1];
                A_Rrj[2 * i]     = A_Rrj[2 * ip];
                A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
                A_Rrj[2 * ip]     = tmpr;
                A_Rrj[2 * ip + 1] = tmpc;
            }
        }
    }

    // A_TRl = A_TL \ A_TRl
    BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
    // Worku = A_TRr
    LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
    // Worku = A_TL \ Worku
    BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
    // A_TRr = Worku
    LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
    // A_BRtl = A_BRtl - A_BLt * A_TRl
    BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
    // A_BRbl = A_BRbl - Workl * A_TRl
    BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
    // A_BRtr = A_BRtr - A_BLt * Worku
    BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
    // A_BRbr = A_BRbr - Workl * Worku
    BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);

    // partially undo swaps in A_L
    for (i = mn1 - 1; i >= 0; i--) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
            else
                BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
        }
    }

    // recursion(Ab_BR, ipiv_B)
    RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
    if (*info)
        *info += n1;
    // shift pivots
    for (i = 0; i < mn2; i++)
        ipiv_B[i] += n1;
 }
--- a/relapack/src/zgemmt.c
+++ b/relapack/src/zgemmt.c
@@ -0,0 +1,167 @@
 #include "relapack.h"

 static void RELAPACK_zgemmt_rec(const char *, const char *, const char *,
    const int *, const int *, const double *, const double *, const int *,
    const double *, const int *, const double *, double *, const int *);

 static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *,
    const int *, const int *, const double *, const double *, const int *,
    const double *, const int *, const double *, double *, const int *);


 /** ZGEMMT computes a matrix-matrix product with general matrices but updates
 * only the upper or lower triangular part of the result matrix.
 *
 * This routine performs the same operation as the BLAS routine
 * zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
 * but only updates the triangular part of C specified by uplo:
 * If (*uplo == 'L'), only the lower triangular part of C is updated,
 * otherwise the upper triangular part is updated.
 * */
 void RELAPACK_zgemmt(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {

 #if HAVE_XGEMMT
    BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
    return;
 #else

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    const int notransA = LAPACK(lsame)(transA, "N");
    const int tranA = LAPACK(lsame)(transA, "T");
    const int ctransA = LAPACK(lsame)(transA, "C");
    const int notransB = LAPACK(lsame)(transB, "N");
    const int tranB = LAPACK(lsame)(transB, "T");
    const int ctransB = LAPACK(lsame)(transB, "C");
    int info = 0;
    if (!lower && !upper)
        info = 1;
    else if (!tranA && !ctransA && !notransA)
        info = 2;
    else if (!tranB && !ctransB && !notransB)
        info = 3;
    else if (*n < 0)
        info = 4;
    else if (*k < 0)
        info = 5;
    else if (*ldA < MAX(1, notransA ? *n : *k))
        info = 8;
    else if (*ldB < MAX(1, notransB ? *k : *n))
        info = 10;
    else if (*ldC < MAX(1, *n))
        info = 13;
    if (info) {
        LAPACK(xerbla)("ZGEMMT", &info);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';
    const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
    const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');

    // Recursive kernel
    RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
 #endif
 }


 /** zgemmt's recursive compute kernel */
 static void RELAPACK_zgemmt_rec(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {

    if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) {
        // Unblocked
        RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
        return;
    }

    // Splitting
    const int n1 = ZREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_T
    // A_B
    const double *const A_T = A;
    const double *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);

    // B_L B_R
    const double *const B_L = B;
    const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);

    // C_TL C_TR
    // C_BL C_BR
    double *const C_TL = C;
    double *const C_TR = C + 2 * *ldC * n1;
    double *const C_BL = C                 + 2 * n1;
    double *const C_BR = C + 2 * *ldC * n1 + 2 * n1;

    // recursion(C_TL)
    RELAPACK_zgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);

    if (*uplo == 'L')
        // C_BL = alpha A_B B_L + beta C_BL
        BLAS(zgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
    else
        // C_TR = alpha A_T B_R + beta C_TR
        BLAS(zgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);

    // recursion(C_BR)
    RELAPACK_zgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
 }


 /** zgemmt's unblocked compute kernel */
 static void RELAPACK_zgemmt_rec2(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const double *alpha, const double *A, const int *ldA,
    const double *B, const int *ldB,
    const double *beta, double *C, const int *ldC
 ) {

    const int incB = (*transB == 'N') ? 1 : *ldB;
    const int incC = 1;

    int i;
    for (i = 0; i < *n; i++) {
        // A_0
        // A_i
        const double *const A_0 = A;
        const double *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);

        // * B_i *
        const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);

        // * C_0i *
        // * C_ii *
        double *const C_0i = C + 2 * *ldC * i;
        double *const C_ii = C + 2 * *ldC * i + 2 * i;

        if (*uplo == 'L') {
            const int nmi = *n - i;
            if (*transA == 'N')
                BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
            else
                BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
        } else {
            const int ip1 = i + 1;
            if (*transA == 'N')
                BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
            else
                BLAS(zgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
        }
    }
 }
--- a/relapack/src/zgetrf.c
+++ b/relapack/src/zgetrf.c
@@ -0,0 +1,117 @@
 #include "relapack.h"

 static void RELAPACK_zgetrf_rec(const int *, const int *, double *,
    const int *, int *, int *);


 /** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's zgetrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html
 * */
 void RELAPACK_zgetrf(
    const int *m, const int *n,
    double *A, const int *ldA, int *ipiv,
    int *info
 ) {

    // Check arguments
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZGETRF", &minfo);
        return;
    }

    const int sn = MIN(*m, *n);

    RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info);

    // Right remainder
    if (*m < *n) {
        // Constants
        const double ONE[]  = { 1., 0. };
        const int    iONE[] = { 1 };

        // Splitting
        const int rn = *n - *m;

        // A_L A_R
        const double *const A_L = A;
        double *const       A_R = A + 2 * *ldA * *m;

        // A_R = apply(ipiv, A_R)
        LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
        // A_R = A_L \ A_R
        BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
    }
 }


 /** zgetrf's recursive compute kernel */
 static void RELAPACK_zgetrf_rec(
    const int *m, const int *n,
    double *A, const int *ldA, int *ipiv,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_ZGETRF, 1)) {
        // Unblocked
        LAPACK(zgetf2)(m, n, A, ldA, ipiv, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };
    const int    iONE[] = { 1. };

    // Splitting
    const int n1 = ZREC_SPLIT(*n);
    const int n2 = *n - n1;
    const int m2 = *m - n1;

    // A_L A_R
    double *const A_L = A;
    double *const A_R = A + 2 * *ldA * n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + 2 * *ldA * n1;
    double *const A_BL = A                 + 2 * n1;
    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;

    // recursion(A_L, ipiv_T)
    RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
    // apply pivots to A_R
    LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);

    // A_TR = A_TL \ A_TR
    BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
    // A_BR = A_BR - A_BL * A_TR
    BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);

    // recursion(A_BR, ipiv_B)
    RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
    if (*info)
        *info += n1;
    // apply pivots to A_BL
    LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
    // shift pivots
    int i;
    for (i = 0; i < n2; i++)
        ipiv_B[i] += n1;
 }
--- a/relapack/src/zhegst.c
+++ b/relapack/src/zhegst.c
@@ -0,0 +1,212 @@
 #include "relapack.h"
 #if XSYGST_ALLOW_MALLOC
 #include "stdlib.h"
 #endif

 static void RELAPACK_zhegst_rec(const int *, const char *, const int *,
    double *, const int *, const double *, const int *,
    double *, const int *, int *);


 /** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
 *
 * This routine is functionally equivalent to LAPACK's zhegst.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html
 * */
 void RELAPACK_zhegst(
    const int *itype, const char *uplo, const int *n,
    double *A, const int *ldA, const double *B, const int *ldB,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (*itype < 1 || *itype > 3)
        *info = -1;
    else if (!lower && !upper)
        *info = -2;
    else if (*n < 0)
        *info = -3;
    else if (*ldA < MAX(1, *n))
        *info = -5;
    else if (*ldB < MAX(1, *n))
        *info = -7;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZHEGST", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Allocate work space
    double *Work = NULL;
    int    lWork = 0;
 #if XSYGST_ALLOW_MALLOC
    const int n1 = ZREC_SPLIT(*n);
    lWork = n1 * (*n - n1);
    Work  = malloc(lWork * 2 * sizeof(double));
    if (!Work)
        lWork = 0;
 #endif

    // recursive kernel
    RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);

    // Free work space
 #if XSYGST_ALLOW_MALLOC
    if (Work)
        free(Work);
 #endif
 }


 /** zhegst's recursive compute kernel */
 static void RELAPACK_zhegst_rec(
    const int *itype, const char *uplo, const int *n,
    double *A, const int *ldA, const double *B, const int *ldB,
    double *Work, const int *lWork, int *info
 ) {

    if (*n <= MAX(CROSSOVER_ZHEGST, 1)) {
        // Unblocked
        LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info);
        return;
    }

    // Constants
    const double ZERO[]  = { 0., 0. };
    const double ONE[]   = { 1., 0. };
    const double MONE[]  = { -1., 0. };
    const double HALF[]  = { .5, 0. };
    const double MHALF[] = { -.5, 0. };
    const int    iONE[]  = { 1 };

    // Loop iterator
    int i;

    // Splitting
    const int n1 = ZREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + 2 * *ldA * n1;
    double *const A_BL = A                 + 2 * n1;
    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // B_TL B_TR
    // B_BL B_BR
    const double *const B_TL = B;
    const double *const B_TR = B + 2 * *ldB * n1;
    const double *const B_BL = B                 + 2 * n1;
    const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;

    // recursion(A_TL, B_TL)
    RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);

    if (*itype == 1)
        if (*uplo == 'L') {
            // A_BL = A_BL / B_TL'
            BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork >= n2 * n1) {
                // T = -1/2 * B_BL * A_TL
                BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            } else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
            BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
            if (*lWork >= n2 * n1)
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR \ A_BL
            BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // A_TR = B_TL' \ A_TR
            BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork >= n2 * n1) {
                // T = -1/2 * A_TL * B_TR
                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_BL + T
                for (i = 0; i < n2; i++)
                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            } else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
            BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
            if (*lWork >= n2 * n1)
                // A_TR = A_BL + T
                for (i = 0; i < n2; i++)
                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR / B_BR
            BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }
    else
        if (*uplo == 'L') {
            // A_BL = A_BL * B_TL
            BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork >= n2 * n1) {
                // T = 1/2 * A_BR * B_BL
                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            } else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
            BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
            if (*lWork >= n2 * n1)
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR * A_BL
            BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // A_TR = B_TL * A_TR
            BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork >= n2 * n1) {
                // T = 1/2 * B_TR * A_BR
                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            } else
                // A_TR = A_TR + 1/2 B_TR A_BR
                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
            BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
            if (*lWork >= n2 * n1)
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            else
                // A_TR = A_TR + 1/2 B_TR * A_BR
                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR * B_BR
            BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }

    // recursion(A_BR, B_BR)
    RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
 }
--- a/relapack/src/zhetrf.c
+++ b/relapack/src/zhetrf.c
@@ -0,0 +1,236 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *,
    double *, const int *, int *, double *, const int *, int *);


 /** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's zhetrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html
 * */
 void RELAPACK_zhetrf(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    double *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(double));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZHETRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
    int nout;

    // Recursive kernel
    RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** zhetrf's recursive compute kernel */
 static void RELAPACK_zhetrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_ZHETRF, 3)) {
        // Unblocked
        if (top) {
            LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };
    const int    iONE[] = { 1 };

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = ZREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        double *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        double *const A_BL   = A                 + 2 * n1;
        double *const A_BR   = A + 2 * *ldA * n1 + 2 * n1;
        double *const A_BL_B = A                 + 2 * *n;
        double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        double *const Work_BL =              Work                    + 2 * n1;
        double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            double *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            double *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = ZREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        double *const A_TL_T = A + 2 * *ldA * n_rest;
        double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        double *const A_TL   = A + 2 * *ldA * n_rest        + 2 * n_rest;
        double *const A_TR   = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        double *const Work_L  = Work;
        double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/zhetrf_rec2.c
+++ b/relapack/src/zhetrf_rec2.c
@@ -0,0 +1,524 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static doublecomplex c_b1 = {1.,0.};
 static int c__1 = 1;

 /** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method
 *
 * This routine is a minor modification of LAPACK's zlahef.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int *
 	nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
 	doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    double d__1, d__2, d__3, d__4;
    doublecomplex z__1, z__2, z__3, z__4;

    /* Builtin functions */
    double sqrt(double), d_imag(doublecomplex *);
    void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
 	    doublecomplex *, doublecomplex *);

    /* Local variables */
    static int j, k;
    static double t, r1;
    static doublecomplex d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static double alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    static int kstep;
    extern /* Subroutine */ int zgemv_(char *, int *, int *,
 	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
 	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
 	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
 	    int *), zswap_(int *, doublecomplex *, int *,
 	    doublecomplex *, int *);
    static double absakk;
    extern /* Subroutine */ int zdscal_(int *, double *,
 	    doublecomplex *, int *);
    static double colmax;
    extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
 	    ;
    extern int izamax_(int *, doublecomplex *, int *);
    static double rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.) + 1.) / 8.;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	kstep = 1;
 	i__1 = k - 1;
 	zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	i__1 = k + kw * w_dim1;
 	i__2 = k + k * a_dim1;
 	d__1 = a[i__2].r;
 	w[i__1].r = d__1, w[i__1].i = 0.;
 	if (k < *n) {
 	    i__1 = *n - k;
 	    z__1.r = -1., z__1.i = -0.;
 	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	    i__1 = k + kw * w_dim1;
 	    i__2 = k + kw * w_dim1;
 	    d__1 = w[i__2].r;
 	    w[i__1].r = d__1, w[i__1].i = 0.;
 	}
 	i__1 = k + kw * w_dim1;
 	absakk = (d__1 = w[i__1].r, abs(d__1));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    i__1 = imax + kw * w_dim1;
 	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
 		    kw * w_dim1]), abs(d__2));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = k + k * a_dim1;
 	    i__2 = k + k * a_dim1;
 	    d__1 = a[i__2].r;
 	    a[i__1].r = d__1, a[i__1].i = 0.;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		i__1 = imax - 1;
 		zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = imax + (kw - 1) * w_dim1;
 		i__2 = imax + imax * a_dim1;
 		d__1 = a[i__2].r;
 		w[i__1].r = d__1, w[i__1].i = 0.;
 		i__1 = k - imax;
 		zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		i__1 = k - imax;
 		zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    z__1.r = -1., z__1.i = -0.;
 		    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		    i__1 = imax + (kw - 1) * w_dim1;
 		    i__2 = imax + (kw - 1) * w_dim1;
 		    d__1 = w[i__2].r;
 		    w[i__1].r = d__1, w[i__1].i = 0.;
 		}
 		i__1 = k - imax;
 		jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
 			 &c__1);
 		i__1 = jmax + (kw - 1) * w_dim1;
 		rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			jmax + (kw - 1) * w_dim1]), abs(d__2));
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 /* Computing MAX */
 		    i__1 = jmax + (kw - 1) * w_dim1;
 		    d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
 			    d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs(
 			    d__2));
 		    rowmax = max(d__3,d__4);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else /* if(complicated condition) */ {
 		    i__1 = imax + (kw - 1) * w_dim1;
 		    if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) {
 			kp = imax;
 			zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 				w_dim1 + 1], &c__1);
 		    } else {
 			kp = imax;
 			kstep = 2;
 		    }
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		d__1 = a[i__2].r;
 		a[i__1].r = d__1, a[i__1].i = 0.;
 		i__1 = kk - 1 - kp;
 		zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		i__1 = kk - 1 - kp;
 		zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
 		if (kp > 1) {
 		    i__1 = kp - 1;
 		    zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
 			    + 1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
 			    + 1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		if (k > 1) {
 		    i__1 = k + k * a_dim1;
 		    r1 = 1. / a[i__1].r;
 		    i__1 = k - 1;
 		    zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 		    i__1 = k - 1;
 		    zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 		}
 	    } else {
 		if (k > 2) {
 		    i__1 = k - 1 + kw * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    d_cnjg(&z__2, &d21);
 		    z_div(&z__1, &w[k + kw * w_dim1], &z__2);
 		    d11.r = z__1.r, d11.i = z__1.i;
 		    z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
 		    d22.r = z__1.r, d22.i = z__1.i;
 		    z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    t = 1. / (z__1.r - 1.);
 		    z__2.r = t, z__2.i = 0.;
 		    z_div(&z__1, &z__2, &d21);
 		    d21.r = z__1.r, d21.i = z__1.i;
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			i__2 = j + (k - 1) * a_dim1;
 			i__3 = j + (kw - 1) * w_dim1;
 			z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + kw * w_dim1;
 			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
 				.i;
 			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
 				d21.r * z__2.i + d21.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 			i__2 = j + k * a_dim1;
 			d_cnjg(&z__2, &d21);
 			i__3 = j + kw * w_dim1;
 			z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + (kw - 1) * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
 				z__2.r * z__3.i + z__2.i * z__3.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L20: */
 		    }
 		}
 		i__1 = k - 1 + (k - 1) * a_dim1;
 		i__2 = k - 1 + (kw - 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1 + k * a_dim1;
 		i__2 = k - 1 + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + k * a_dim1;
 		i__2 = k + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1;
 		zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 		i__1 = k - 2;
 		zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    ++j;
 	}
 	++j;
 	if (jp != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
 	}
 	if (j < *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	kstep = 1;
 	i__1 = k + k * w_dim1;
 	i__2 = k + k * a_dim1;
 	d__1 = a[i__2].r;
 	w[i__1].r = d__1, w[i__1].i = 0.;
 	if (k < *n) {
 	    i__1 = *n - k;
 	    zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
 		    w_dim1], &c__1);
 	}
 	i__1 = *n - k + 1;
 	i__2 = k - 1;
 	z__1.r = -1., z__1.i = -0.;
 	zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k
 		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
 	i__1 = k + k * w_dim1;
 	i__2 = k + k * w_dim1;
 	d__1 = w[i__2].r;
 	w[i__1].r = d__1, w[i__1].i = 0.;
 	i__1 = k + k * w_dim1;
 	absakk = (d__1 = w[i__1].r, abs(d__1));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    i__1 = imax + k * w_dim1;
 	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
 		    k * w_dim1]), abs(d__2));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = k + k * a_dim1;
 	    i__2 = k + k * a_dim1;
 	    d__1 = a[i__2].r;
 	    a[i__1].r = d__1, a[i__1].i = 0.;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		i__1 = imax - k;
 		zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = imax - k;
 		zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
 		i__1 = imax + (k + 1) * w_dim1;
 		i__2 = imax + imax * a_dim1;
 		d__1 = a[i__2].r;
 		w[i__1].r = d__1, w[i__1].i = 0.;
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
 			    imax + 1 + (k + 1) * w_dim1], &c__1);
 		}
 		i__1 = *n - k + 1;
 		i__2 = k - 1;
 		z__1.r = -1., z__1.i = -0.;
 		zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1],
 			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
 			w_dim1], &c__1, (ftnlen)12);
 		i__1 = imax + (k + 1) * w_dim1;
 		i__2 = imax + (k + 1) * w_dim1;
 		d__1 = w[i__2].r;
 		w[i__1].r = d__1, w[i__1].i = 0.;
 		i__1 = imax - k;
 		jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
 			;
 		i__1 = jmax + (k + 1) * w_dim1;
 		rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			jmax + (k + 1) * w_dim1]), abs(d__2));
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 /* Computing MAX */
 		    i__1 = jmax + (k + 1) * w_dim1;
 		    d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
 			    d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs(
 			    d__2));
 		    rowmax = max(d__3,d__4);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else /* if(complicated condition) */ {
 		    i__1 = imax + (k + 1) * w_dim1;
 		    if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) {
 			kp = imax;
 			i__1 = *n - k + 1;
 			zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
 				k * w_dim1], &c__1);
 		    } else {
 			kp = imax;
 			kstep = 2;
 		    }
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		d__1 = a[i__2].r;
 		a[i__1].r = d__1, a[i__1].i = 0.;
 		i__1 = kp - kk - 1;
 		zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
 			1) * a_dim1], lda);
 		i__1 = kp - kk - 1;
 		zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
 		if (kp < *n) {
 		    i__1 = *n - kp;
 		    zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
 			    + kp * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		}
 		zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    i__1 = k + k * a_dim1;
 		    r1 = 1. / a[i__1].r;
 		    i__1 = *n - k;
 		    zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		    i__1 = *n - k;
 		    zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 		}
 	    } else {
 		if (k < *n - 1) {
 		    i__1 = k + 1 + k * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
 		    d11.r = z__1.r, d11.i = z__1.i;
 		    d_cnjg(&z__2, &d21);
 		    z_div(&z__1, &w[k + k * w_dim1], &z__2);
 		    d22.r = z__1.r, d22.i = z__1.i;
 		    z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    t = 1. / (z__1.r - 1.);
 		    z__2.r = t, z__2.i = 0.;
 		    z_div(&z__1, &z__2, &d21);
 		    d21.r = z__1.r, d21.i = z__1.i;
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			i__2 = j + k * a_dim1;
 			d_cnjg(&z__2, &d21);
 			i__3 = j + k * w_dim1;
 			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + (k + 1) * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
 				z__2.r * z__3.i + z__2.i * z__3.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 			i__2 = j + (k + 1) * a_dim1;
 			i__3 = j + (k + 1) * w_dim1;
 			z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + k * w_dim1;
 			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
 				.i;
 			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
 				d21.r * z__2.i + d21.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L80: */
 		    }
 		}
 		i__1 = k + k * a_dim1;
 		i__2 = k + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + k * a_dim1;
 		i__2 = k + 1 + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + (k + 1) * a_dim1;
 		i__2 = k + 1 + (k + 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = *n - k;
 		zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 		i__1 = *n - k - 1;
 		zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    --j;
 	}
 	--j;
 	if (jp != jj && j >= 1) {
 	    zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j > 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/zhetrf_rook.c
+++ b/relapack/src/zhetrf_rook.c
@@ -0,0 +1,236 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *,
    double *, const int *, int *, double *, const int *, int *);


 /** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's zhetrf_rook.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html
 * */
 void RELAPACK_zhetrf_rook(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    double *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(double));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZHETRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
    int nout;

    // Recursive kernel
    RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** zhetrf_rook's recursive compute kernel */
 static void RELAPACK_zhetrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) {
        // Unblocked
        if (top) {
            LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };
    const int    iONE[] = { 1 };

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = ZREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        double *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        double *const A_BL   = A                 + 2 * n1;
        double *const A_BR   = A + 2 * *ldA * n1 + 2 * n1;
        double *const A_BL_B = A                 + 2 * *n;
        double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        double *const Work_BL =              Work                    + 2 * n1;
        double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            double *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            double *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = ZREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        double *const A_TL_T = A + 2 * *ldA * n_rest;
        double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        double *const A_TL   = A + 2 * *ldA * n_rest        + 2 * n_rest;
        double *const A_TR   = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        double *const Work_L  = Work;
        double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/zhetrf_rook_rec2.c
+++ b/relapack/src/zhetrf_rook_rec2.c
@@ -0,0 +1,662 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static doublecomplex c_b1 = {1.,0.};
 static int c__1 = 1;

 /** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method
 *
 * This routine is a minor modification of LAPACK's zlahef_rook.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n,
 	int *nb, int *kb, doublecomplex *a, int *lda, int *
 	ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    double d__1, d__2;
    doublecomplex z__1, z__2, z__3, z__4, z__5;

    /* Builtin functions */
    double sqrt(double), d_imag(doublecomplex *);
    void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
 	    doublecomplex *, doublecomplex *);

    /* Local variables */
    static int j, k, p;
    static double t, r1;
    static doublecomplex d11, d21, d22;
    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
    static logical done;
    static int imax, jmax;
    static double alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    static double dtemp, sfmin;
    static int itemp, kstep;
    extern /* Subroutine */ int zgemv_(char *, int *, int *,
 	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
 	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
 	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
 	    int *), zswap_(int *, doublecomplex *, int *,
 	    doublecomplex *, int *);
    extern double dlamch_(char *, ftnlen);
    static double absakk;
    extern /* Subroutine */ int zdscal_(int *, double *,
 	    doublecomplex *, int *);
    static double colmax;
    extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
 	    ;
    extern int izamax_(int *, doublecomplex *, int *);
    static double rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.) + 1.) / 8.;
    sfmin = dlamch_("S", (ftnlen)1);
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	kstep = 1;
 	p = k;
 	if (k > 1) {
 	    i__1 = k - 1;
 	    zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
 		    c__1);
 	}
 	i__1 = k + kw * w_dim1;
 	i__2 = k + k * a_dim1;
 	d__1 = a[i__2].r;
 	w[i__1].r = d__1, w[i__1].i = 0.;
 	if (k < *n) {
 	    i__1 = *n - k;
 	    z__1.r = -1., z__1.i = -0.;
 	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	    i__1 = k + kw * w_dim1;
 	    i__2 = k + kw * w_dim1;
 	    d__1 = w[i__2].r;
 	    w[i__1].r = d__1, w[i__1].i = 0.;
 	}
 	i__1 = k + kw * w_dim1;
 	absakk = (d__1 = w[i__1].r, abs(d__1));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    i__1 = imax + kw * w_dim1;
 	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
 		    kw * w_dim1]), abs(d__2));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = k + k * a_dim1;
 	    i__2 = k + kw * w_dim1;
 	    d__1 = w[i__2].r;
 	    a[i__1].r = d__1, a[i__1].i = 0.;
 	    if (k > 1) {
 		i__1 = k - 1;
 		zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
 			&c__1);
 	    }
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L12:
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			    w_dim1 + 1], &c__1);
 		}
 		i__1 = imax + (kw - 1) * w_dim1;
 		i__2 = imax + imax * a_dim1;
 		d__1 = a[i__2].r;
 		w[i__1].r = d__1, w[i__1].i = 0.;
 		i__1 = k - imax;
 		zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		i__1 = k - imax;
 		zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    z__1.r = -1., z__1.i = -0.;
 		    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		    i__1 = imax + (kw - 1) * w_dim1;
 		    i__2 = imax + (kw - 1) * w_dim1;
 		    d__1 = w[i__2].r;
 		    w[i__1].r = d__1, w[i__1].i = 0.;
 		}
 		if (imax != k) {
 		    i__1 = k - imax;
 		    jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) *
 			    w_dim1], &c__1);
 		    i__1 = jmax + (kw - 1) * w_dim1;
 		    rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
 			    w[jmax + (kw - 1) * w_dim1]), abs(d__2));
 		} else {
 		    rowmax = 0.;
 		}
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 		    i__1 = itemp + (kw - 1) * w_dim1;
 		    dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			    itemp + (kw - 1) * w_dim1]), abs(d__2));
 		    if (dtemp > rowmax) {
 			rowmax = dtemp;
 			jmax = itemp;
 		    }
 		}
 		i__1 = imax + (kw - 1) * w_dim1;
 		if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) {
 		    kp = imax;
 		    zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		}
 		if (! done) {
 		    goto L12;
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kstep == 2 && p != k) {
 		i__1 = p + p * a_dim1;
 		i__2 = k + k * a_dim1;
 		d__1 = a[i__2].r;
 		a[i__1].r = d__1, a[i__1].i = 0.;
 		i__1 = k - 1 - p;
 		zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
 			a_dim1], lda);
 		i__1 = k - 1 - p;
 		zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
 		if (p > 1) {
 		    i__1 = p - 1;
 		    zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
 			    1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
 			    1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
 			 ldw);
 	    }
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		d__1 = a[i__2].r;
 		a[i__1].r = d__1, a[i__1].i = 0.;
 		i__1 = kk - 1 - kp;
 		zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		i__1 = kk - 1 - kp;
 		zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
 		if (kp > 1) {
 		    i__1 = kp - 1;
 		    zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
 			    + 1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
 			    + 1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		if (k > 1) {
 		    i__1 = k + k * a_dim1;
 		    t = a[i__1].r;
 		    if (abs(t) >= sfmin) {
 			r1 = 1. / t;
 			i__1 = k - 1;
 			zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 		    } else {
 			i__1 = k - 1;
 			for (ii = 1; ii <= i__1; ++ii) {
 			    i__2 = ii + k * a_dim1;
 			    i__3 = ii + k * a_dim1;
 			    z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t;
 			    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L14: */
 			}
 		    }
 		    i__1 = k - 1;
 		    zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 		}
 	    } else {
 		if (k > 2) {
 		    i__1 = k - 1 + kw * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    d_cnjg(&z__2, &d21);
 		    z_div(&z__1, &w[k + kw * w_dim1], &z__2);
 		    d11.r = z__1.r, d11.i = z__1.i;
 		    z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
 		    d22.r = z__1.r, d22.i = z__1.i;
 		    z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    t = 1. / (z__1.r - 1.);
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			i__2 = j + (k - 1) * a_dim1;
 			i__3 = j + (kw - 1) * w_dim1;
 			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + kw * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			z_div(&z__2, &z__3, &d21);
 			z__1.r = t * z__2.r, z__1.i = t * z__2.i;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 			i__2 = j + k * a_dim1;
 			i__3 = j + kw * w_dim1;
 			z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + (kw - 1) * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			d_cnjg(&z__5, &d21);
 			z_div(&z__2, &z__3, &z__5);
 			z__1.r = t * z__2.r, z__1.i = t * z__2.i;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L20: */
 		    }
 		}
 		i__1 = k - 1 + (k - 1) * a_dim1;
 		i__2 = k - 1 + (kw - 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1 + k * a_dim1;
 		i__2 = k - 1 + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + k * a_dim1;
 		i__2 = k + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1;
 		zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 		i__1 = k - 2;
 		zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    ++j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	++j;
 	if (jp2 != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	++jj;
 	if (kstep == 2 && jp1 != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	if (j < *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	kstep = 1;
 	p = k;
 	i__1 = k + k * w_dim1;
 	i__2 = k + k * a_dim1;
 	d__1 = a[i__2].r;
 	w[i__1].r = d__1, w[i__1].i = 0.;
 	if (k < *n) {
 	    i__1 = *n - k;
 	    zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
 		    w_dim1], &c__1);
 	}
 	if (k > 1) {
 	    i__1 = *n - k + 1;
 	    i__2 = k - 1;
 	    z__1.r = -1., z__1.i = -0.;
 	    zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &
 		    w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
 		    ftnlen)12);
 	    i__1 = k + k * w_dim1;
 	    i__2 = k + k * w_dim1;
 	    d__1 = w[i__2].r;
 	    w[i__1].r = d__1, w[i__1].i = 0.;
 	}
 	i__1 = k + k * w_dim1;
 	absakk = (d__1 = w[i__1].r, abs(d__1));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    i__1 = imax + k * w_dim1;
 	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
 		    k * w_dim1]), abs(d__2));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = k + k * a_dim1;
 	    i__2 = k + k * w_dim1;
 	    d__1 = w[i__2].r;
 	    a[i__1].r = d__1, a[i__1].i = 0.;
 	    if (k < *n) {
 		i__1 = *n - k;
 		zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
 			a_dim1], &c__1);
 	    }
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L72:
 		i__1 = imax - k;
 		zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = imax - k;
 		zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
 		i__1 = imax + (k + 1) * w_dim1;
 		i__2 = imax + imax * a_dim1;
 		d__1 = a[i__2].r;
 		w[i__1].r = d__1, w[i__1].i = 0.;
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
 			    imax + 1 + (k + 1) * w_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = *n - k + 1;
 		    i__2 = k - 1;
 		    z__1.r = -1., z__1.i = -0.;
 		    zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1]
 			    , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
 			    1) * w_dim1], &c__1, (ftnlen)12);
 		    i__1 = imax + (k + 1) * w_dim1;
 		    i__2 = imax + (k + 1) * w_dim1;
 		    d__1 = w[i__2].r;
 		    w[i__1].r = d__1, w[i__1].i = 0.;
 		}
 		if (imax != k) {
 		    i__1 = imax - k;
 		    jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &
 			    c__1);
 		    i__1 = jmax + (k + 1) * w_dim1;
 		    rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
 			    w[jmax + (k + 1) * w_dim1]), abs(d__2));
 		} else {
 		    rowmax = 0.;
 		}
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 		    i__1 = itemp + (k + 1) * w_dim1;
 		    dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			    itemp + (k + 1) * w_dim1]), abs(d__2));
 		    if (dtemp > rowmax) {
 			rowmax = dtemp;
 			jmax = itemp;
 		    }
 		}
 		i__1 = imax + (k + 1) * w_dim1;
 		if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) {
 		    kp = imax;
 		    i__1 = *n - k + 1;
 		    zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    i__1 = *n - k + 1;
 		    zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		}
 		if (! done) {
 		    goto L72;
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kstep == 2 && p != k) {
 		i__1 = p + p * a_dim1;
 		i__2 = k + k * a_dim1;
 		d__1 = a[i__2].r;
 		a[i__1].r = d__1, a[i__1].i = 0.;
 		i__1 = p - k - 1;
 		zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
 			a_dim1], lda);
 		i__1 = p - k - 1;
 		zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
 		if (p < *n) {
 		    i__1 = *n - p;
 		    zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
 			    * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
 		}
 		zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
 	    }
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		d__1 = a[i__2].r;
 		a[i__1].r = d__1, a[i__1].i = 0.;
 		i__1 = kp - kk - 1;
 		zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
 			1) * a_dim1], lda);
 		i__1 = kp - kk - 1;
 		zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
 		if (kp < *n) {
 		    i__1 = *n - kp;
 		    zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
 			    + kp * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		}
 		zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    i__1 = k + k * a_dim1;
 		    t = a[i__1].r;
 		    if (abs(t) >= sfmin) {
 			r1 = 1. / t;
 			i__1 = *n - k;
 			zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		    } else {
 			i__1 = *n;
 			for (ii = k + 1; ii <= i__1; ++ii) {
 			    i__2 = ii + k * a_dim1;
 			    i__3 = ii + k * a_dim1;
 			    z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t;
 			    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L74: */
 			}
 		    }
 		    i__1 = *n - k;
 		    zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 		}
 	    } else {
 		if (k < *n - 1) {
 		    i__1 = k + 1 + k * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
 		    d11.r = z__1.r, d11.i = z__1.i;
 		    d_cnjg(&z__2, &d21);
 		    z_div(&z__1, &w[k + k * w_dim1], &z__2);
 		    d22.r = z__1.r, d22.i = z__1.i;
 		    z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    t = 1. / (z__1.r - 1.);
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			i__2 = j + k * a_dim1;
 			i__3 = j + k * w_dim1;
 			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + (k + 1) * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			d_cnjg(&z__5, &d21);
 			z_div(&z__2, &z__3, &z__5);
 			z__1.r = t * z__2.r, z__1.i = t * z__2.i;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 			i__2 = j + (k + 1) * a_dim1;
 			i__3 = j + (k + 1) * w_dim1;
 			z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + k * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			z_div(&z__2, &z__3, &d21);
 			z__1.r = t * z__2.r, z__1.i = t * z__2.i;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L80: */
 		    }
 		}
 		i__1 = k + k * a_dim1;
 		i__2 = k + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + k * a_dim1;
 		i__2 = k + 1 + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + (k + 1) * a_dim1;
 		i__2 = k + 1 + (k + 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = *n - k;
 		zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 		i__1 = *n - k - 1;
 		zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    --j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	--j;
 	if (jp2 != jj && j >= 1) {
 	    zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	--jj;
 	if (kstep == 2 && jp1 != jj && j >= 1) {
 	    zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j > 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/zlauum.c
+++ b/relapack/src/zlauum.c
@@ -0,0 +1,87 @@
 #include "relapack.h"

 static void RELAPACK_zlauum_rec(const char *, const int *, double *,
    const int *, int *);


 /** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
 *
 * This routine is functionally equivalent to LAPACK's zlauum.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html
 * */
 void RELAPACK_zlauum(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZLAUUM", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Recursive kernel
    RELAPACK_zlauum_rec(&cleanuplo, n, A, ldA, info);
 }


 /** zlauum's recursive compute kernel */
 static void RELAPACK_zlauum_rec(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) {
        // Unblocked
        LAPACK(zlauu2)(uplo, n, A, ldA, info);
        return;
    }

    // Constants
    const double ONE[] = { 1., 0. };

    // Splitting
    const int n1 = ZREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + 2 * *ldA * n1;
    double *const A_BL = A                 + 2 * n1;
    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL)
    RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info);

    if (*uplo == 'L') {
        // A_TL = A_TL + A_BL' * A_BL
        BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
        // A_BL = A_BR' * A_BL
        BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
    } else {
        // A_TL = A_TL + A_TR * A_TR'
        BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
        // A_TR = A_TR * A_BR'
        BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info);
 }
--- a/relapack/src/zpbtrf.c
+++ b/relapack/src/zpbtrf.c
@@ -0,0 +1,157 @@
 #include "relapack.h"
 #include "stdlib.h"

 static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *,
    double *, const int *, double *, const int *, int *);


 /** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
 *
 * This routine is functionally equivalent to LAPACK's zpbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html
 * */
 void RELAPACK_zpbtrf(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kd < 0)
        *info = -3;
    else if (*ldAb < *kd + 1)
        *info = -5;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZPBTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Constant
    const double ZERO[] = { 0., 0. };

    // Allocate work space
    const int n1 = ZREC_SPLIT(*n);
    const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
    const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
    double *Work = malloc(mWork * nWork * 2 * sizeof(double));
    LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);

    // Recursive kernel
    RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);

    // Free work space
    free(Work);
 }


 /** zpbtrf's recursive compute kernel */
 static void RELAPACK_zpbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    double *Work, const int *ldWork,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) {
        // Unblocked
        LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };

    // Unskew A
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);

    // Splitting
    const int n1 = MIN(ZREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;

    // * *
    // * Ab_BR
    double *const Ab_BR = Ab + 2 * *ldAb * n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + 2 * *ldA * n1;
    double *const A_BL = A                 + 2 * n1;
    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL)
    RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    // Banded splitting
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, *kd);

    //     n1    n21    n22
    // n1  *     A_TRl  A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    double *const A_TRl  = A_TR;
    double *const A_TRr  = A_TR + 2 * *ldA * n21;
    double *const A_BLt  = A_BL;
    double *const A_BLb  = A_BL                   + 2 * n21;
    double *const A_BRtl = A_BR;
    double *const A_BRtr = A_BR + 2 * *ldA * n21;
    double *const A_BRbl = A_BR                   + 2 * n21;
    double *const A_BRbr = A_BR + 2 * *ldA * n21  + 2 * n21;

    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb
        LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work
        LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr
        LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work
        LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }

    // recursion(A_BR)
    if (*kd > n1)
        RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/zpotrf.c
+++ b/relapack/src/zpotrf.c
@@ -0,0 +1,92 @@
 #include "relapack.h"

 static void RELAPACK_zpotrf_rec(const char *, const int *, double *,
        const int *, int *);


 /** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
 *
 * This routine is functionally equivalent to LAPACK's zpotrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html
 * */
 void RELAPACK_zpotrf(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZPOTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Recursive kernel
    RELAPACK_zpotrf_rec(&cleanuplo, n, A, ldA, info);
 }


 /** zpotrf's recursive compute kernel */
 static void RELAPACK_zpotrf_rec(
    const char *uplo, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) {
        // Unblocked
        LAPACK(zpotf2)(uplo, n, A, ldA, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };

    // Splitting
    const int n1 = ZREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + 2 * *ldA * n1;
    double *const A_BL = A                 + 2 * n1;
    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL)
    RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    if (*uplo == 'L') {
        // A_BL = A_BL / A_TL'
        BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
        // A_BR = A_BR - A_BL * A_BL'
        BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
    } else {
        // A_TR = A_TL' \ A_TR
        BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
        // A_BR = A_BR - A_TR' * A_TR
        BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info);
    if (*info)
        *info += n1;
 }
--- a/relapack/src/zsytrf.c
+++ b/relapack/src/zsytrf.c
@@ -0,0 +1,238 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *,
    double *, const int *, int *, double *, const int *, int *);


 /** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's zsytrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html
 * */
 void RELAPACK_zsytrf(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    double *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(double));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZSYTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy arguments
    int nout;

    // Recursive kernel
    RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** zsytrf's recursive compute kernel */
 static void RELAPACK_zsytrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) {
        // Unblocked
        if (top) {
            LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };
    const int    iONE[] = { 1 };

    // Loop iterator
    int i;

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = ZREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        double *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        double *const A_BL   = A                 + 2 * n1;
        double *const A_BR   = A + 2 * *ldA * n1 + 2 * n1;
        double *const A_BL_B = A                 + 2 * *n;
        double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        double *const Work_BL =              Work                    + 2 * n1;
        double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            double *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            double *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info  = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = ZREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1  = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        double *const A_TL_T = A + 2 * *ldA * n_rest;
        double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        double *const A_TL   = A + 2 * *ldA * n_rest        + 2 * n_rest;
        double *const A_TR   = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        double *const Work_L  = Work;
        double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/zsytrf_rec2.c
+++ b/relapack/src/zsytrf_rec2.c
@@ -0,0 +1,452 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static doublecomplex c_b1 = {1.,0.};
 static int c__1 = 1;

 /** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method.
 *
 * This routine is a minor modification of LAPACK's zlasyf.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int *
 	nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
 	doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    double d__1, d__2, d__3, d__4;
    doublecomplex z__1, z__2, z__3;

    /* Builtin functions */
    double sqrt(double), d_imag(doublecomplex *);
    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);

    /* Local variables */
    static int j, k;
    static doublecomplex t, r1, d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static double alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
 	    doublecomplex *, int *);
    static int kstep;
    extern /* Subroutine */ int zgemv_(char *, int *, int *,
 	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
 	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
 	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
 	    int *), zswap_(int *, doublecomplex *, int *,
 	    doublecomplex *, int *);
    static double absakk, colmax;
    extern int izamax_(int *, doublecomplex *, int *);
    static double rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.) + 1.) / 8.;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	if (k < *n) {
 	    i__1 = *n - k;
 	    z__1.r = -1., z__1.i = -0.;
 	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	}
 	kstep = 1;
 	i__1 = k + kw * w_dim1;
 	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
 		w_dim1]), abs(d__2));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    i__1 = imax + kw * w_dim1;
 	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
 		    kw * w_dim1]), abs(d__2));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = k - imax;
 		zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    z__1.r = -1., z__1.i = -0.;
 		    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		}
 		i__1 = k - imax;
 		jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
 			 &c__1);
 		i__1 = jmax + (kw - 1) * w_dim1;
 		rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			jmax + (kw - 1) * w_dim1]), abs(d__2));
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 /* Computing MAX */
 		    i__1 = jmax + (kw - 1) * w_dim1;
 		    d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
 			    d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs(
 			    d__2));
 		    rowmax = max(d__3,d__4);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else /* if(complicated condition) */ {
 		    i__1 = imax + (kw - 1) * w_dim1;
 		    if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			    imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha *
 			    rowmax) {
 			kp = imax;
 			zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 				w_dim1 + 1], &c__1);
 		    } else {
 			kp = imax;
 			kstep = 2;
 		    }
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
 		i__1 = kk - 1 - kp;
 		zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		if (kp > 1) {
 		    i__1 = kp - 1;
 		    zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
 			    + 1], &c__1);
 		}
 		if (k < *n) {
 		    i__1 = *n - k;
 		    zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
 			    + 1) * a_dim1], lda);
 		}
 		i__1 = *n - kk + 1;
 		zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
 		r1.r = z__1.r, r1.i = z__1.i;
 		i__1 = k - 1;
 		zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 	    } else {
 		if (k > 2) {
 		    i__1 = k - 1 + kw * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    z_div(&z__1, &w[k + kw * w_dim1], &d21);
 		    d11.r = z__1.r, d11.i = z__1.i;
 		    z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
 		    d22.r = z__1.r, d22.i = z__1.i;
 		    z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
 		    z_div(&z__1, &c_b1, &z__2);
 		    t.r = z__1.r, t.i = z__1.i;
 		    z_div(&z__1, &t, &d21);
 		    d21.r = z__1.r, d21.i = z__1.i;
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			i__2 = j + (k - 1) * a_dim1;
 			i__3 = j + (kw - 1) * w_dim1;
 			z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + kw * w_dim1;
 			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
 				.i;
 			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
 				d21.r * z__2.i + d21.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 			i__2 = j + k * a_dim1;
 			i__3 = j + kw * w_dim1;
 			z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + (kw - 1) * w_dim1;
 			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
 				.i;
 			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
 				d21.r * z__2.i + d21.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L20: */
 		    }
 		}
 		i__1 = k - 1 + (k - 1) * a_dim1;
 		i__2 = k - 1 + (kw - 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1 + k * a_dim1;
 		i__2 = k - 1 + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + k * a_dim1;
 		i__2 = k + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    ++j;
 	}
 	++j;
 	if (jp != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
 	}
 	if (j < *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	i__1 = *n - k + 1;
 	zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
 	i__1 = *n - k + 1;
 	i__2 = k - 1;
 	z__1.r = -1., z__1.i = -0.;
 	zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k
 		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
 	kstep = 1;
 	i__1 = k + k * w_dim1;
 	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k *
 		w_dim1]), abs(d__2));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    i__1 = imax + k * w_dim1;
 	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
 		    k * w_dim1]), abs(d__2));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	} else {
 	    if (absakk >= alpha * colmax) {
 		kp = k;
 	    } else {
 		i__1 = imax - k;
 		zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = *n - imax + 1;
 		zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
 			1) * w_dim1], &c__1);
 		i__1 = *n - k + 1;
 		i__2 = k - 1;
 		z__1.r = -1., z__1.i = -0.;
 		zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1],
 			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
 			w_dim1], &c__1, (ftnlen)12);
 		i__1 = imax - k;
 		jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
 			;
 		i__1 = jmax + (k + 1) * w_dim1;
 		rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			jmax + (k + 1) * w_dim1]), abs(d__2));
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 /* Computing MAX */
 		    i__1 = jmax + (k + 1) * w_dim1;
 		    d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
 			    d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs(
 			    d__2));
 		    rowmax = max(d__3,d__4);
 		}
 		if (absakk >= alpha * colmax * (colmax / rowmax)) {
 		    kp = k;
 		} else /* if(complicated condition) */ {
 		    i__1 = imax + (k + 1) * w_dim1;
 		    if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			    imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha *
 			    rowmax) {
 			kp = imax;
 			i__1 = *n - k + 1;
 			zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
 				k * w_dim1], &c__1);
 		    } else {
 			kp = imax;
 			kstep = 2;
 		    }
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kp != kk) {
 		i__1 = kp + kp * a_dim1;
 		i__2 = kk + kk * a_dim1;
 		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
 		i__1 = kp - kk - 1;
 		zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
 			1) * a_dim1], lda);
 		if (kp < *n) {
 		    i__1 = *n - kp;
 		    zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
 			    + kp * a_dim1], &c__1);
 		}
 		if (k > 1) {
 		    i__1 = k - 1;
 		    zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		}
 		zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
 		    r1.r = z__1.r, r1.i = z__1.i;
 		    i__1 = *n - k;
 		    zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		}
 	    } else {
 		if (k < *n - 1) {
 		    i__1 = k + 1 + k * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
 		    d11.r = z__1.r, d11.i = z__1.i;
 		    z_div(&z__1, &w[k + k * w_dim1], &d21);
 		    d22.r = z__1.r, d22.i = z__1.i;
 		    z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
 		    z_div(&z__1, &c_b1, &z__2);
 		    t.r = z__1.r, t.i = z__1.i;
 		    z_div(&z__1, &t, &d21);
 		    d21.r = z__1.r, d21.i = z__1.i;
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			i__2 = j + k * a_dim1;
 			i__3 = j + k * w_dim1;
 			z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + (k + 1) * w_dim1;
 			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
 				.i;
 			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
 				d21.r * z__2.i + d21.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 			i__2 = j + (k + 1) * a_dim1;
 			i__3 = j + (k + 1) * w_dim1;
 			z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + k * w_dim1;
 			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
 				.i;
 			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
 				d21.r * z__2.i + d21.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L80: */
 		    }
 		}
 		i__1 = k + k * a_dim1;
 		i__2 = k + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + k * a_dim1;
 		i__2 = k + 1 + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + (k + 1) * a_dim1;
 		i__2 = k + 1 + (k + 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -kp;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	jj = j;
 	jp = ipiv[j];
 	if (jp < 0) {
 	    jp = -jp;
 	    --j;
 	}
 	--j;
 	if (jp != jj && j >= 1) {
 	    zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j > 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/zsytrf_rook.c
+++ b/relapack/src/zsytrf_rook.c
@@ -0,0 +1,236 @@
 #include "relapack.h"
 #if XSYTRF_ALLOW_MALLOC
 #include <stdlib.h>
 #endif

 static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *,
    double *, const int *, int *, double *, const int *, int *);


 /** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's zsytrf_rook.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html
 * */
 void RELAPACK_zsytrf_rook(
    const char *uplo, const int *n,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *lWork, int *info
 ) {

    // Required work size
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
 #if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
 #endif

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size
    double *cleanWork = Work;
 #if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(double));
        if (!cleanWork)
            *info = -7;
    }
 #endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZSYTRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument
    int nout;

    // Recursive kernel
    RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

 #if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
 #endif
 }


 /** zsytrf_rook's recursive compute kernel */
 static void RELAPACK_zsytrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    double *A, const int *ldA, int *ipiv,
    double *Work, const int *ldWork, int *info
 ) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) {
        // Unblocked
        if (top) {
            LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };
    const int    iONE[] = { 1 };

    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = ZREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        double *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued)
        n2 = *n - n1;
        const int n_full2   = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        double *const A_BL   = A                 + 2 * n1;
        double *const A_BR   = A + 2 * *ldA * n1 + 2 * n1;
        double *const A_BL_B = A                 + 2 * *n;
        double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *        *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        double *const Work_BL =              Work                    + 2 * n1;
        double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            double *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            double *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info  = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = ZREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        double *const A_TL_T = A + 2 * *ldA * n_rest;
        double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        double *const A_TL   = A + 2 * *ldA * n_rest        + 2 * n_rest;
        double *const A_TR   = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        double *const Work_L  = Work;
        double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info  = info2 || info1;
        *n_out = n1 + n2;
    }
 }
--- a/relapack/src/zsytrf_rook_rec2.c
+++ b/relapack/src/zsytrf_rook_rec2.c
@@ -0,0 +1,561 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "f2c.h"

 /* Table of constant values */

 static doublecomplex c_b1 = {1.,0.};
 static int c__1 = 1;

 /** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method.
 *
 * This routine is a minor modification of LAPACK's zlasyf_rook.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 * */
 /* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n,
 	int *nb, int *kb, doublecomplex *a, int *lda, int *
 	ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    double d__1, d__2;
    doublecomplex z__1, z__2, z__3, z__4;

    /* Builtin functions */
    double sqrt(double), d_imag(doublecomplex *);
    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);

    /* Local variables */
    static int j, k, p;
    static doublecomplex t, r1, d11, d12, d21, d22;
    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
    static logical done;
    static int imax, jmax;
    static double alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    static double dtemp, sfmin;
    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
 	    doublecomplex *, int *);
    static int itemp, kstep;
    extern /* Subroutine */ int zgemv_(char *, int *, int *,
 	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
 	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
 	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
 	    int *), zswap_(int *, doublecomplex *, int *,
 	    doublecomplex *, int *);
    extern double dlamch_(char *, ftnlen);
    static double absakk, colmax;
    extern int izamax_(int *, doublecomplex *, int *);
    static double rowmax;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
    alpha = (sqrt(17.) + 1.) / 8.;
    sfmin = dlamch_("S", (ftnlen)1);
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	k = *n;
 L10:
 	kw = *nb + k - *n;
 	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
 	    goto L30;
 	}
 	kstep = 1;
 	p = k;
 	zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
 	if (k < *n) {
 	    i__1 = *n - k;
 	    z__1.r = -1., z__1.i = -0.;
 	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
 		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
 		    w_dim1 + 1], &c__1, (ftnlen)12);
 	}
 	i__1 = k + kw * w_dim1;
 	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
 		w_dim1]), abs(d__2));
 	if (k > 1) {
 	    i__1 = k - 1;
 	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
 	    i__1 = imax + kw * w_dim1;
 	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
 		    kw * w_dim1]), abs(d__2));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L12:
 		zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
 			w_dim1 + 1], &c__1);
 		i__1 = k - imax;
 		zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
 			1 + (kw - 1) * w_dim1], &c__1);
 		if (k < *n) {
 		    i__1 = *n - k;
 		    z__1.r = -1., z__1.i = -0.;
 		    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
 			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
 			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
 			    ftnlen)12);
 		}
 		if (imax != k) {
 		    i__1 = k - imax;
 		    jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) *
 			    w_dim1], &c__1);
 		    i__1 = jmax + (kw - 1) * w_dim1;
 		    rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
 			    w[jmax + (kw - 1) * w_dim1]), abs(d__2));
 		} else {
 		    rowmax = 0.;
 		}
 		if (imax > 1) {
 		    i__1 = imax - 1;
 		    itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
 		    i__1 = itemp + (kw - 1) * w_dim1;
 		    dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			    itemp + (kw - 1) * w_dim1]), abs(d__2));
 		    if (dtemp > rowmax) {
 			rowmax = dtemp;
 			jmax = itemp;
 		    }
 		}
 		i__1 = imax + (kw - 1) * w_dim1;
 		if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax
 			+ (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) {
 		    kp = imax;
 		    zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
 			    w_dim1 + 1], &c__1);
 		}
 		if (! done) {
 		    goto L12;
 		}
 	    }
 	    kk = k - kstep + 1;
 	    kkw = *nb + kk - *n;
 	    if (kstep == 2 && p != k) {
 		i__1 = k - p;
 		zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
 			a_dim1], lda);
 		zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
 			c__1);
 		i__1 = *n - k + 1;
 		zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
 			lda);
 		i__1 = *n - kk + 1;
 		zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
 			 ldw);
 	    }
 	    if (kp != kk) {
 		i__1 = kp + k * a_dim1;
 		i__2 = kk + k * a_dim1;
 		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
 		i__1 = k - 1 - kp;
 		zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
 			1) * a_dim1], lda);
 		zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
 			c__1);
 		i__1 = *n - kk + 1;
 		zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
 			 lda);
 		i__1 = *n - kk + 1;
 		zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
 			w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
 			c__1);
 		if (k > 1) {
 		    i__1 = k + k * a_dim1;
 		    if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k +
 			    k * a_dim1]), abs(d__2)) >= sfmin) {
 			z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
 			r1.r = z__1.r, r1.i = z__1.i;
 			i__1 = k - 1;
 			zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
 		    } else /* if(complicated condition) */ {
 			i__1 = k + k * a_dim1;
 			if (a[i__1].r != 0. || a[i__1].i != 0.) {
 			    i__1 = k - 1;
 			    for (ii = 1; ii <= i__1; ++ii) {
 				i__2 = ii + k * a_dim1;
 				z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
 					a_dim1]);
 				a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L14: */
 			    }
 			}
 		    }
 		}
 	    } else {
 		if (k > 2) {
 		    i__1 = k - 1 + kw * w_dim1;
 		    d12.r = w[i__1].r, d12.i = w[i__1].i;
 		    z_div(&z__1, &w[k + kw * w_dim1], &d12);
 		    d11.r = z__1.r, d11.i = z__1.i;
 		    z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
 		    d22.r = z__1.r, d22.i = z__1.i;
 		    z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
 		    z_div(&z__1, &c_b1, &z__2);
 		    t.r = z__1.r, t.i = z__1.i;
 		    i__1 = k - 2;
 		    for (j = 1; j <= i__1; ++j) {
 			i__2 = j + (k - 1) * a_dim1;
 			i__3 = j + (kw - 1) * w_dim1;
 			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + kw * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			z_div(&z__2, &z__3, &d12);
 			z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
 				z__2.i + t.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 			i__2 = j + k * a_dim1;
 			i__3 = j + kw * w_dim1;
 			z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + (kw - 1) * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			z_div(&z__2, &z__3, &d12);
 			z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
 				z__2.i + t.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L20: */
 		    }
 		}
 		i__1 = k - 1 + (k - 1) * a_dim1;
 		i__2 = k - 1 + (kw - 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k - 1 + k * a_dim1;
 		i__2 = k - 1 + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + k * a_dim1;
 		i__2 = k + kw * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k - 1] = -kp;
 	}
 	k -= kstep;
 	goto L10;
 L30:
 	j = k + 1;
 L60:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    ++j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	++j;
 	if (jp2 != jj && j <= *n) {
 	    i__1 = *n - j + 1;
 	    zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	jj = j - 1;
 	if (jp1 != jj && kstep == 2) {
 	    i__1 = *n - j + 1;
 	    zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
 		    ;
 	}
 	if (j <= *n) {
 	    goto L60;
 	}
 	*kb = *n - k;
    } else {
 	k = 1;
 L70:
 	if ((k >= *nb && *nb < *n) || k > *n) {
 	    goto L90;
 	}
 	kstep = 1;
 	p = k;
 	i__1 = *n - k + 1;
 	zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
 	if (k > 1) {
 	    i__1 = *n - k + 1;
 	    i__2 = k - 1;
 	    z__1.r = -1., z__1.i = -0.;
 	    zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &
 		    w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
 		    ftnlen)12);
 	}
 	i__1 = k + k * w_dim1;
 	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k *
 		w_dim1]), abs(d__2));
 	if (k < *n) {
 	    i__1 = *n - k;
 	    imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
 	    i__1 = imax + k * w_dim1;
 	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
 		    k * w_dim1]), abs(d__2));
 	} else {
 	    colmax = 0.;
 	}
 	if (max(absakk,colmax) == 0.) {
 	    if (*info == 0) {
 		*info = k;
 	    }
 	    kp = k;
 	    i__1 = *n - k + 1;
 	    zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 		    c__1);
 	} else {
 	    if (! (absakk < alpha * colmax)) {
 		kp = k;
 	    } else {
 		done = FALSE_;
 L72:
 		i__1 = imax - k;
 		zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
 			w_dim1], &c__1);
 		i__1 = *n - imax + 1;
 		zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
 			1) * w_dim1], &c__1);
 		if (k > 1) {
 		    i__1 = *n - k + 1;
 		    i__2 = k - 1;
 		    z__1.r = -1., z__1.i = -0.;
 		    zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1]
 			    , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
 			    1) * w_dim1], &c__1, (ftnlen)12);
 		}
 		if (imax != k) {
 		    i__1 = imax - k;
 		    jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &
 			    c__1);
 		    i__1 = jmax + (k + 1) * w_dim1;
 		    rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
 			    w[jmax + (k + 1) * w_dim1]), abs(d__2));
 		} else {
 		    rowmax = 0.;
 		}
 		if (imax < *n) {
 		    i__1 = *n - imax;
 		    itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
 			    w_dim1], &c__1);
 		    i__1 = itemp + (k + 1) * w_dim1;
 		    dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
 			    itemp + (k + 1) * w_dim1]), abs(d__2));
 		    if (dtemp > rowmax) {
 			rowmax = dtemp;
 			jmax = itemp;
 		    }
 		}
 		i__1 = imax + (k + 1) * w_dim1;
 		if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax
 			+ (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) {
 		    kp = imax;
 		    i__1 = *n - k + 1;
 		    zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		    done = TRUE_;
 		} else if (p == jmax || rowmax <= colmax) {
 		    kp = imax;
 		    kstep = 2;
 		    done = TRUE_;
 		} else {
 		    p = imax;
 		    colmax = rowmax;
 		    imax = jmax;
 		    i__1 = *n - k + 1;
 		    zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
 			    w_dim1], &c__1);
 		}
 		if (! done) {
 		    goto L72;
 		}
 	    }
 	    kk = k + kstep - 1;
 	    if (kstep == 2 && p != k) {
 		i__1 = p - k;
 		zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
 			lda);
 		i__1 = *n - p + 1;
 		zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
 			c__1);
 		zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
 		zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
 	    }
 	    if (kp != kk) {
 		i__1 = kp + k * a_dim1;
 		i__2 = kk + k * a_dim1;
 		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
 		i__1 = kp - k - 1;
 		zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
 			* a_dim1], lda);
 		i__1 = *n - kp + 1;
 		zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
 			a_dim1], &c__1);
 		zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
 		zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
 	    }
 	    if (kstep == 1) {
 		i__1 = *n - k + 1;
 		zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
 			c__1);
 		if (k < *n) {
 		    i__1 = k + k * a_dim1;
 		    if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k +
 			    k * a_dim1]), abs(d__2)) >= sfmin) {
 			z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
 			r1.r = z__1.r, r1.i = z__1.i;
 			i__1 = *n - k;
 			zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
 		    } else /* if(complicated condition) */ {
 			i__1 = k + k * a_dim1;
 			if (a[i__1].r != 0. || a[i__1].i != 0.) {
 			    i__1 = *n;
 			    for (ii = k + 1; ii <= i__1; ++ii) {
 				i__2 = ii + k * a_dim1;
 				z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
 					a_dim1]);
 				a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L74: */
 			    }
 			}
 		    }
 		}
 	    } else {
 		if (k < *n - 1) {
 		    i__1 = k + 1 + k * w_dim1;
 		    d21.r = w[i__1].r, d21.i = w[i__1].i;
 		    z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
 		    d11.r = z__1.r, d11.i = z__1.i;
 		    z_div(&z__1, &w[k + k * w_dim1], &d21);
 		    d22.r = z__1.r, d22.i = z__1.i;
 		    z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
 			    d22.i + d11.i * d22.r;
 		    z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
 		    z_div(&z__1, &c_b1, &z__2);
 		    t.r = z__1.r, t.i = z__1.i;
 		    i__1 = *n;
 		    for (j = k + 2; j <= i__1; ++j) {
 			i__2 = j + k * a_dim1;
 			i__3 = j + k * w_dim1;
 			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
 				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
 				.r;
 			i__4 = j + (k + 1) * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			z_div(&z__2, &z__3, &d21);
 			z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
 				z__2.i + t.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 			i__2 = j + (k + 1) * a_dim1;
 			i__3 = j + (k + 1) * w_dim1;
 			z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
 				z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
 				.r;
 			i__4 = j + k * w_dim1;
 			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
 				.i;
 			z_div(&z__2, &z__3, &d21);
 			z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
 				z__2.i + t.i * z__2.r;
 			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
 /* L80: */
 		    }
 		}
 		i__1 = k + k * a_dim1;
 		i__2 = k + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + k * a_dim1;
 		i__2 = k + 1 + k * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 		i__1 = k + 1 + (k + 1) * a_dim1;
 		i__2 = k + 1 + (k + 1) * w_dim1;
 		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
 	    }
 	}
 	if (kstep == 1) {
 	    ipiv[k] = kp;
 	} else {
 	    ipiv[k] = -p;
 	    ipiv[k + 1] = -kp;
 	}
 	k += kstep;
 	goto L70;
 L90:
 	j = k - 1;
 L120:
 	kstep = 1;
 	jp1 = 1;
 	jj = j;
 	jp2 = ipiv[j];
 	if (jp2 < 0) {
 	    jp2 = -jp2;
 	    --j;
 	    jp1 = -ipiv[j];
 	    kstep = 2;
 	}
 	--j;
 	if (jp2 != jj && j >= 1) {
 	    zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	jj = j + 1;
 	if (jp1 != jj && kstep == 2) {
 	    zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
 	}
 	if (j >= 1) {
 	    goto L120;
 	}
 	*kb = k - 1;
    }
    return;
 }
--- a/relapack/src/ztgsyl.c
+++ b/relapack/src/ztgsyl.c
@@ -0,0 +1,268 @@
 #include "relapack.h"
 #include <math.h>

 static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *,
    const int *, const double *, const int *, const double *, const int *,
    double *, const int *, const double *, const int *, const double *,
    const int *, double *, const int *, double *, double *, double *, int *);


 /** ZTGSYL solves the generalized Sylvester equation.
 *
 * This routine is functionally equivalent to LAPACK's ztgsyl.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html
 * */
 void RELAPACK_ztgsyl(
    const char *trans, const int *ijob, const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC,
    const double *D, const int *ldD, const double *E, const int *ldE,
    double *F, const int *ldF,
    double *scale, double *dif,
    double *Work, const int *lWork, int *iWork, int *info
 ) {

    // Parse arguments
    const int notran = LAPACK(lsame)(trans, "N");
    const int tran = LAPACK(lsame)(trans, "C");

    // Compute work buffer size
    int lwmin = 1;
    if (notran && (*ijob == 1 || *ijob == 2))
        lwmin = MAX(1, 2 * *m * *n);
    *info = 0;

    // Check arguments
    if (!tran && !notran)
        *info = -1;
    else if (notran && (*ijob < 0 || *ijob > 4))
        *info = -2;
    else if (*m <= 0)
        *info = -3;
    else if (*n <= 0)
        *info = -4;
    else if (*ldA < MAX(1, *m))
        *info = -6;
    else if (*ldB < MAX(1, *n))
        *info = -8;
    else if (*ldC < MAX(1, *m))
        *info = -10;
    else if (*ldD < MAX(1, *m))
        *info = -12;
    else if (*ldE < MAX(1, *n))
        *info = -14;
    else if (*ldF < MAX(1, *m))
        *info = -16;
    else if (*lWork < lwmin && *lWork != -1)
        *info = -20;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZTGSYL", &minfo);
        return;
    }

    if (*lWork == -1) {
        // Work size query
        *Work = lwmin;
        return;
    }

    // Clean char * arguments
    const char cleantrans = notran ? 'N' : 'C';

    // Constant
    const double ZERO[] = { 0., 0. };

    int isolve = 1;
    int ifunc  = 0;
    if (notran) {
        if (*ijob >= 3) {
            ifunc = *ijob - 2;
            LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
            LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
        } else if (*ijob >= 1)
            isolve = 2;
    }

    double scale2;
    int iround;
    for (iround = 1; iround <= isolve; iround++) {
        *scale = 1;
        double dscale = 0;
        double dsum   = 1;
        RELAPACK_ztgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
        if (dscale != 0) {
            if (*ijob == 1 || *ijob == 3)
                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
            else
                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
        }
        if (isolve == 2) {
            if (iround == 1) {
                if (notran)
                    ifunc = *ijob;
                scale2 = *scale;
                LAPACK(zlacpy)("F", m, n, C, ldC, Work, m);
                LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
                LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
                LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
            } else {
                LAPACK(zlacpy)("F", m, n, Work, m, C, ldC);
                LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
                *scale = scale2;
            }
        }
    }
 }


 /** ztgsyl's recursive vompute kernel */
 static void RELAPACK_ztgsyl_rec(
    const char *trans, const int *ifunc, const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC,
    const double *D, const int *ldD, const double *E, const int *ldE,
    double *F, const int *ldF,
    double *scale, double *dsum, double *dscale,
    int *info
 ) {

    if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) {
        // Unblocked
        LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };
    const int    iONE[] = { 1 };

    // Outputs
    double scale1[] = { 1., 0. };
    double scale2[] = { 1., 0. };
    int    info1[]  = { 0 };
    int    info2[]  = { 0 };

    if (*m > *n) {
        // Splitting
        const int m1 = ZREC_SPLIT(*m);
        const int m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
        const double *const A_TL = A;
        const double *const A_TR = A + 2 * *ldA * m1;
        const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;

        // C_T
        // C_B
        double *const C_T = C;
        double *const C_B = C + 2 * m1;

        // D_TL D_TR
        // 0    D_BR
        const double *const D_TL = D;
        const double *const D_TR = D + 2 * *ldD * m1;
        const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1;

        // F_T
        // F_B
        double *const F_T = F;
        double *const F_B = F + 2 * m1;

        if (*trans == 'N') {
            // recursion(A_BR, B, C_B, D_BR, E, F_B)
            RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
            // C_T = C_T - A_TR * C_B
            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
            // F_T = F_T - D_TR * C_B
            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
            // recursion(A_TL, B, C_T, D_TL, E, F_T)
            RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
            }
        } else {
            // recursion(A_TL, B, C_T, D_TL, E, F_T)
            RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
            // apply scale
            if (scale1[0] != 1)
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
            // C_B = C_B - A_TR^H * C_T
            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
            // C_B = C_B - D_TR^H * F_T
            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
            // recursion(A_BR, B, C_B, D_BR, E, F_B)
            RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
            }
        }
    } else {
        // Splitting
        const int n1 = ZREC_SPLIT(*n);
        const int n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
        const double *const B_TL = B;
        const double *const B_TR = B + 2 * *ldB * n1;
        const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;

        // C_L C_R
        double *const C_L = C;
        double *const C_R = C + 2 * *ldC * n1;

        // E_TL E_TR
        // 0    E_BR
        const double *const E_TL = E;
        const double *const E_TR = E + 2 * *ldE * n1;
        const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1;

        // F_L F_R
        double *const F_L = F;
        double *const F_R = F + 2 * *ldF * n1;

        if (*trans == 'N') {
            // recursion(A, B_TL, C_L, D, E_TL, F_L)
            RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
            // C_R = C_R + F_L * B_TR
            BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
            // F_R = F_R + F_L * E_TR
            BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
            // recursion(A, B_BR, C_R, D, E_BR, F_R)
            RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
            }
        } else {
            // recursion(A, B_BR, C_R, D, E_BR, F_R)
            RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
            // apply scale
            if (scale1[0] != 1)
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
            // F_L = F_L + C_R * B_TR
            BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
            // F_L = F_L + F_R * E_TR
            BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
            // recursion(A, B_TL, C_L, D, E_TL, F_L)
            RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
            // apply scale
            if (scale2[0] != 1) {
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
            }
        }
    }

    *scale = scale1[0] * scale2[0];
    *info  = info1[0] || info2[0];
 }
--- a/relapack/src/ztrsyl.c
+++ b/relapack/src/ztrsyl.c
@@ -0,0 +1,163 @@
 #include "relapack.h"

 static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *,
    const int *, const int *, const double *, const int *, const double *,
    const int *, double *, const int *, double *, int *);


 /** ZTRSYL solves the complex Sylvester matrix equation.
 *
 * This routine is functionally equivalent to LAPACK's ztrsyl.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html
 * */
 void RELAPACK_ztrsyl(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC, double *scale,
    int *info
 ) {

    // Check arguments
    const int notransA = LAPACK(lsame)(tranA, "N");
    const int ctransA = LAPACK(lsame)(tranA, "C");
    const int notransB = LAPACK(lsame)(tranB, "N");
    const int ctransB = LAPACK(lsame)(tranB, "C");
    *info = 0;
    if (!ctransA && !notransA)
        *info = -1;
    else if (!ctransB && !notransB)
        *info = -2;
    else if (*isgn != 1 && *isgn != -1)
        *info = -3;
    else if (*m < 0)
        *info = -4;
    else if (*n < 0)
        *info = -5;
    else if (*ldA < MAX(1, *m))
        *info = -7;
    else if (*ldB < MAX(1, *n))
        *info = -9;
    else if (*ldC < MAX(1, *m))
        *info = -11;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZTRSYL", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleantranA = notransA ? 'N' : 'C';
    const char cleantranB = notransB ? 'N' : 'C';

    // Recursive kernel
    RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
 }


 /** ztrsyl's recursive compute kernel */
 static void RELAPACK_ztrsyl_rec(
    const char *tranA, const char *tranB, const int *isgn,
    const int *m, const int *n,
    const double *A, const int *ldA, const double *B, const int *ldB,
    double *C, const int *ldC, double *scale,
    int *info
 ) {

    if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) {
        // Unblocked
        RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1., 0. };
    const double MONE[] = { -1., 0. };
    const double MSGN[] = { -*isgn, 0. };
    const int    iONE[] = { 1 };

    // Outputs
    double scale1[] = { 1., 0. };
    double scale2[] = { 1., 0. };
    int    info1[]  = { 0 };
    int    info2[]  = { 0 };

    if (*m > *n) {
        // Splitting
        const int m1 = ZREC_SPLIT(*m);
        const int m2 = *m - m1;

        // A_TL A_TR
        // 0    A_BR
        const double *const A_TL = A;
        const double *const A_TR = A + 2 * *ldA * m1;
        const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;

        // C_T
        // C_B
        double *const C_T = C;
        double *const C_B = C + 2 * m1;

        if (*tranA == 'N') {
            // recusion(A_BR, B, C_B)
            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
            // C_T = C_T - A_TR * C_B
            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
            // recusion(A_TL, B, C_T)
            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
        } else {
            // recusion(A_TL, B, C_T)
            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
            // C_B = C_B - A_TR' * C_T
            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
            // recusion(A_BR, B, C_B)
            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
        }
    } else {
        // Splitting
        const int n1 = ZREC_SPLIT(*n);
        const int n2 = *n - n1;

        // B_TL B_TR
        // 0    B_BR
        const double *const B_TL = B;
        const double *const B_TR = B + 2 * *ldB * n1;
        const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;

        // C_L C_R
        double *const C_L = C;
        double *const C_R = C + 2 * *ldC * n1;

        if (*tranB == 'N') {
            // recusion(A, B_TL, C_L)
            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
            // C_R = C_R -/+ C_L * B_TR
            BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
            // recusion(A, B_BR, C_R)
            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
        } else {
            // recusion(A, B_BR, C_R)
            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
            // C_L = C_L -/+ C_R * B_TR'
            BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
            // recusion(A, B_TL, C_L)
            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
            // apply scale
            if (scale2[0] != 1)
                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
        }
    }

    *scale = scale1[0] * scale2[0];
    *info  = info1[0] || info2[0];
 }
--- a/relapack/src/ztrsyl_rec2.c
+++ b/relapack/src/ztrsyl_rec2.c
@@ -0,0 +1,394 @@
 /*  -- translated by f2c (version 20100827).
   You must link the resulting object file with libf2c:
 	on Microsoft Windows system, link with libf2c.lib;
 	on Linux or Unix systems, link with .../path/to/libf2c.a -lm
 	or, if you install libf2c.a in a standard place, with -lf2c -lm
 	-- in that order, at the end of the command line, as in
 		cc *.o -lf2c -lm
 	Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

 		http://www.netlib.org/f2c/libf2c.zip
 */

 #include "../config.h"
 #include "f2c.h"

 #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
 doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) {
    extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *);
    doublecomplex result;
    zdotu_(&result, n, x, incx, y, incy);
    return result;
 }
 #define zdotu_ zdotu_fun

 doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) {
    extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *);
    doublecomplex result;
    zdotc_(&result, n, x, incx, y, incy);
    return result;
 }
 #define zdotc_ zdotc_fun
 #endif

 #if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
 doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) {
    extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *);
    doublecomplex result;
    zladiv_(&result, a, b);
    return result;
 }
 #define zladiv_ zladiv_fun
 #endif

 /* Table of constant values */

 static int c__1 = 1;

 /** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
 *
 * This routine is an exact copy of LAPACK's ztrsyl.
 * It serves as an unblocked kernel in the recursive algorithms.
 * */
 /* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int
 	*isgn, int *m, int *n, doublecomplex *a, int *lda,
 	doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc,
 	double *scale, int *info, ftnlen trana_len, ftnlen tranb_len)
 {
    /* System generated locals */
    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3, i__4;
    double d__1, d__2;
    doublecomplex z__1, z__2, z__3, z__4;

    /* Builtin functions */
    double d_imag(doublecomplex *);
    void d_cnjg(doublecomplex *, doublecomplex *);

    /* Local variables */
    static int j, k, l;
    static doublecomplex a11;
    static double db;
    static doublecomplex x11;
    static double da11;
    static doublecomplex vec;
    static double dum[1], eps, sgn, smin;
    static doublecomplex suml, sumr;
    extern int lsame_(char *, char *, ftnlen, ftnlen);
    /* Double Complex */ doublecomplex zdotc_(int *,
 	    doublecomplex *, int *, doublecomplex *, int *), zdotu_(
 	    int *, doublecomplex *, int *,
 	    doublecomplex *, int *);
    extern /* Subroutine */ int dlabad_(double *, double *);
    extern double dlamch_(char *, ftnlen);
    static double scaloc;
    extern /* Subroutine */ int xerbla_(char *, int *, ftnlen);
    extern double zlange_(char *, int *, int *, doublecomplex *,
 	    int *, double *, ftnlen);
    static double bignum;
    extern /* Subroutine */ int zdscal_(int *, double *,
 	    doublecomplex *, int *);
    /* Double Complex */ doublecomplex zladiv_(doublecomplex *,
 	     doublecomplex *);
    static int notrna, notrnb;
    static double smlnum;

    /* Parameter adjustments */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1;
    b -= b_offset;
    c_dim1 = *ldc;
    c_offset = 1 + c_dim1;
    c__ -= c_offset;

    /* Function Body */
    notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
    notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
    *info = 0;
    if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
 	*info = -1;
    } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
 	*info = -2;
    } else if (*isgn != 1 && *isgn != -1) {
 	*info = -3;
    } else if (*m < 0) {
 	*info = -4;
    } else if (*n < 0) {
 	*info = -5;
    } else if (*lda < max(1,*m)) {
 	*info = -7;
    } else if (*ldb < max(1,*n)) {
 	*info = -9;
    } else if (*ldc < max(1,*m)) {
 	*info = -11;
    }
    if (*info != 0) {
 	i__1 = -(*info);
 	xerbla_("ZTRSY2", &i__1, (ftnlen)6);
 	return;
    }
    *scale = 1.;
    if (*m == 0 || *n == 0) {
 	return;
    }
    eps = dlamch_("P", (ftnlen)1);
    smlnum = dlamch_("S", (ftnlen)1);
    bignum = 1. / smlnum;
    dlabad_(&smlnum, &bignum);
    smlnum = smlnum * (double) (*m * *n) / eps;
    bignum = 1. / smlnum;
 /* Computing MAX */
    d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, (
 	    ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n,
 	    &b[b_offset], ldb, dum, (ftnlen)1);
    smin = max(d__1,d__2);
    sgn = (double) (*isgn);
    if (notrna && notrnb) {
 	i__1 = *n;
 	for (l = 1; l <= i__1; ++l) {
 	    for (k = *m; k >= 1; --k) {
 		i__2 = *m - k;
 /* Computing MIN */
 		i__3 = k + 1;
 /* Computing MIN */
 		i__4 = k + 1;
 		z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
 			min(i__4,*m) + l * c_dim1], &c__1);
 		suml.r = z__1.r, suml.i = z__1.i;
 		i__2 = l - 1;
 		z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
 			, &c__1);
 		sumr.r = z__1.r, sumr.i = z__1.i;
 		i__2 = k + l * c_dim1;
 		z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i;
 		z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
 		z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i;
 		vec.r = z__1.r, vec.i = z__1.i;
 		scaloc = 1.;
 		i__2 = k + k * a_dim1;
 		i__3 = l + l * b_dim1;
 		z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i;
 		z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i;
 		a11.r = z__1.r, a11.i = z__1.i;
 		da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
 			d__2));
 		if (da11 <= smin) {
 		    a11.r = smin, a11.i = 0.;
 		    da11 = smin;
 		    *info = 1;
 		}
 		db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
 			d__2));
 		if (da11 < 1. && db > 1.) {
 		    if (db > bignum * da11) {
 			scaloc = 1. / db;
 		    }
 		}
 		z__3.r = scaloc, z__3.i = 0.;
 		z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
 			z__3.i + vec.i * z__3.r;
 		z__1 = zladiv_(&z__2, &a11);
 		x11.r = z__1.r, x11.i = z__1.i;
 		if (scaloc != 1.) {
 		    i__2 = *n;
 		    for (j = 1; j <= i__2; ++j) {
 			zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
 /* L10: */
 		    }
 		    *scale *= scaloc;
 		}
 		i__2 = k + l * c_dim1;
 		c__[i__2].r = x11.r, c__[i__2].i = x11.i;
 /* L20: */
 	    }
 /* L30: */
 	}
    } else if (! notrna && notrnb) {
 	i__1 = *n;
 	for (l = 1; l <= i__1; ++l) {
 	    i__2 = *m;
 	    for (k = 1; k <= i__2; ++k) {
 		i__3 = k - 1;
 		z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
 			c_dim1 + 1], &c__1);
 		suml.r = z__1.r, suml.i = z__1.i;
 		i__3 = l - 1;
 		z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
 			, &c__1);
 		sumr.r = z__1.r, sumr.i = z__1.i;
 		i__3 = k + l * c_dim1;
 		z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i;
 		z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
 		z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i;
 		vec.r = z__1.r, vec.i = z__1.i;
 		scaloc = 1.;
 		d_cnjg(&z__2, &a[k + k * a_dim1]);
 		i__3 = l + l * b_dim1;
 		z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i;
 		z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
 		a11.r = z__1.r, a11.i = z__1.i;
 		da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
 			d__2));
 		if (da11 <= smin) {
 		    a11.r = smin, a11.i = 0.;
 		    da11 = smin;
 		    *info = 1;
 		}
 		db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
 			d__2));
 		if (da11 < 1. && db > 1.) {
 		    if (db > bignum * da11) {
 			scaloc = 1. / db;
 		    }
 		}
 		z__3.r = scaloc, z__3.i = 0.;
 		z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
 			z__3.i + vec.i * z__3.r;
 		z__1 = zladiv_(&z__2, &a11);
 		x11.r = z__1.r, x11.i = z__1.i;
 		if (scaloc != 1.) {
 		    i__3 = *n;
 		    for (j = 1; j <= i__3; ++j) {
 			zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
 /* L40: */
 		    }
 		    *scale *= scaloc;
 		}
 		i__3 = k + l * c_dim1;
 		c__[i__3].r = x11.r, c__[i__3].i = x11.i;
 /* L50: */
 	    }
 /* L60: */
 	}
    } else if (! notrna && ! notrnb) {
 	for (l = *n; l >= 1; --l) {
 	    i__1 = *m;
 	    for (k = 1; k <= i__1; ++k) {
 		i__2 = k - 1;
 		z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
 			c_dim1 + 1], &c__1);
 		suml.r = z__1.r, suml.i = z__1.i;
 		i__2 = *n - l;
 /* Computing MIN */
 		i__3 = l + 1;
 /* Computing MIN */
 		i__4 = l + 1;
 		z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
 			l + min(i__4,*n) * b_dim1], ldb);
 		sumr.r = z__1.r, sumr.i = z__1.i;
 		i__2 = k + l * c_dim1;
 		d_cnjg(&z__4, &sumr);
 		z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
 		z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
 		z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i;
 		vec.r = z__1.r, vec.i = z__1.i;
 		scaloc = 1.;
 		i__2 = k + k * a_dim1;
 		i__3 = l + l * b_dim1;
 		z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i;
 		z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i;
 		d_cnjg(&z__1, &z__2);
 		a11.r = z__1.r, a11.i = z__1.i;
 		da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
 			d__2));
 		if (da11 <= smin) {
 		    a11.r = smin, a11.i = 0.;
 		    da11 = smin;
 		    *info = 1;
 		}
 		db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
 			d__2));
 		if (da11 < 1. && db > 1.) {
 		    if (db > bignum * da11) {
 			scaloc = 1. / db;
 		    }
 		}
 		z__3.r = scaloc, z__3.i = 0.;
 		z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
 			z__3.i + vec.i * z__3.r;
 		z__1 = zladiv_(&z__2, &a11);
 		x11.r = z__1.r, x11.i = z__1.i;
 		if (scaloc != 1.) {
 		    i__2 = *n;
 		    for (j = 1; j <= i__2; ++j) {
 			zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
 /* L70: */
 		    }
 		    *scale *= scaloc;
 		}
 		i__2 = k + l * c_dim1;
 		c__[i__2].r = x11.r, c__[i__2].i = x11.i;
 /* L80: */
 	    }
 /* L90: */
 	}
    } else if (notrna && ! notrnb) {
 	for (l = *n; l >= 1; --l) {
 	    for (k = *m; k >= 1; --k) {
 		i__1 = *m - k;
 /* Computing MIN */
 		i__2 = k + 1;
 /* Computing MIN */
 		i__3 = k + 1;
 		z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
 			min(i__3,*m) + l * c_dim1], &c__1);
 		suml.r = z__1.r, suml.i = z__1.i;
 		i__1 = *n - l;
 /* Computing MIN */
 		i__2 = l + 1;
 /* Computing MIN */
 		i__3 = l + 1;
 		z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
 			l + min(i__3,*n) * b_dim1], ldb);
 		sumr.r = z__1.r, sumr.i = z__1.i;
 		i__1 = k + l * c_dim1;
 		d_cnjg(&z__4, &sumr);
 		z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
 		z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
 		z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i;
 		vec.r = z__1.r, vec.i = z__1.i;
 		scaloc = 1.;
 		i__1 = k + k * a_dim1;
 		d_cnjg(&z__3, &b[l + l * b_dim1]);
 		z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i;
 		z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i;
 		a11.r = z__1.r, a11.i = z__1.i;
 		da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
 			d__2));
 		if (da11 <= smin) {
 		    a11.r = smin, a11.i = 0.;
 		    da11 = smin;
 		    *info = 1;
 		}
 		db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
 			d__2));
 		if (da11 < 1. && db > 1.) {
 		    if (db > bignum * da11) {
 			scaloc = 1. / db;
 		    }
 		}
 		z__3.r = scaloc, z__3.i = 0.;
 		z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
 			z__3.i + vec.i * z__3.r;
 		z__1 = zladiv_(&z__2, &a11);
 		x11.r = z__1.r, x11.i = z__1.i;
 		if (scaloc != 1.) {
 		    i__1 = *n;
 		    for (j = 1; j <= i__1; ++j) {
 			zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
 /* L100: */
 		    }
 		    *scale *= scaloc;
 		}
 		i__1 = k + l * c_dim1;
 		c__[i__1].r = x11.r, c__[i__1].i = x11.i;
 /* L110: */
 	    }
 /* L120: */
 	}
    }
    return;
 }
--- a/relapack/src/ztrtri.c
+++ b/relapack/src/ztrtri.c
@@ -0,0 +1,107 @@
 #include "relapack.h"

 static void RELAPACK_ztrtri_rec(const char *, const char *, const int *,
    double *, const int *, int *);


 /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A.
 *
 * This routine is functionally equivalent to LAPACK's ztrtri.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html
 * */
 void RELAPACK_ztrtri(
    const char *uplo, const char *diag, const int *n,
    double *A, const int *ldA,
    int *info
 ) {

    // Check arguments
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    const int nounit = LAPACK(lsame)(diag, "N");
    const int unit = LAPACK(lsame)(diag, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (!nounit && !unit)
        *info = -2;
    else if (*n < 0)
        *info = -3;
    else if (*ldA < MAX(1, *n))
        *info = -5;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("ZTRTRI", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower  ? 'L' : 'U';
    const char cleandiag = nounit ? 'N' : 'U';

    // check for singularity
    if (nounit) {
        int i;
        for (i = 0; i < *n; i++)
            if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
                *info = i;
                return;
            }
    }

    // Recursive kernel
    RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
 }


 /** ztrtri's recursive compute kernel */
 static void RELAPACK_ztrtri_rec(
    const char *uplo, const char *diag, const int *n,
    double *A, const int *ldA,
    int *info
 ){

    if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) {
        // Unblocked
        LAPACK(ztrti2)(uplo, diag, n, A, ldA, info);
        return;
    }

    // Constants
    const double ONE[]  = { 1. };
    const double MONE[] = { -1. };

    // Splitting
    const int n1 = ZREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + 2 * *ldA * n1;
    double *const A_BL = A                 + 2 * n1;
    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL)
    RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
    if (*info)
        return;

    if (*uplo == 'L') {
        // A_BL = - A_BL * A_TL
        BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
        // A_BL = A_BR \ A_BL
        BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
    } else {
        // A_TR = - A_TL * A_TR
        BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
        // A_TR = A_TR / A_BR
        BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
    }

    // recursion(A_BR)
    RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
    if (*info)
        *info += n1;
 }