Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651)

3 years ago · 6eb707d941
--- a/lapack-netlib/SRC/clatrs3.f
+++ b/lapack-netlib/SRC/clatrs3.f
@@ -0,0 +1,666 @@
 *> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.
 *
 *  Definition:
 *  ===========
 *
 *      SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
 *                          X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          DIAG, NORMIN, TRANS, UPLO
 *       INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
 *       ..
 *       .. Array Arguments ..
 *       REAL               CNORM( * ), SCALE( * ), WORK( * )
 *       COMPLEX            A( LDA, * ), X( LDX, * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CLATRS3 solves one of the triangular systems
 *>
 *>    A * X = B * diag(scale),  A**T * X = B * diag(scale), or
 *>    A**H * X = B * diag(scale)
 *>
 *> with scaling to prevent overflow.  Here A is an upper or lower
 *> triangular matrix, A**T denotes the transpose of A, A**H denotes the
 *> conjugate transpose of A. X and B are n-by-nrhs matrices and scale
 *> is an nrhs-element vector of scaling factors. A scaling factor scale(j)
 *> is usually less than or equal to 1, chosen such that X(:,j) is less
 *> than the overflow threshold. If the matrix A is singular (A(j,j) = 0
 *> for some j), then a non-trivial solution to A*X = 0 is returned. If
 *> the system is so badly scaled that the solution cannot be represented
 *> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
 *>
 *> This is a BLAS-3 version of LATRS for solving several right
 *> hand sides simultaneously.
 *>
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] UPLO
 *> \verbatim
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the matrix A is upper or lower triangular.
 *>          = 'U':  Upper triangular
 *>          = 'L':  Lower triangular
 *> \endverbatim
 *>
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
 *>          Specifies the operation applied to A.
 *>          = 'N':  Solve A * x = s*b  (No transpose)
 *>          = 'T':  Solve A**T* x = s*b  (Transpose)
 *>          = 'C':  Solve A**T* x = s*b  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] DIAG
 *> \verbatim
 *>          DIAG is CHARACTER*1
 *>          Specifies whether or not the matrix A is unit triangular.
 *>          = 'N':  Non-unit triangular
 *>          = 'U':  Unit triangular
 *> \endverbatim
 *>
 *> \param[in] NORMIN
 *> \verbatim
 *>          NORMIN is CHARACTER*1
 *>          Specifies whether CNORM has been set or not.
 *>          = 'Y':  CNORM contains the column norms on entry
 *>          = 'N':  CNORM is not set on entry.  On exit, the norms will
 *>                  be computed and stored in CNORM.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] NRHS
 *> \verbatim
 *>          NRHS is INTEGER
 *>          The number of columns of X.  NRHS >= 0.
 *> \endverbatim
 *>
 *> \param[in] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
 *>          The triangular matrix A.  If UPLO = 'U', the leading n by n
 *>          upper triangular part of the array A contains the upper
 *>          triangular matrix, and the strictly lower triangular part of
 *>          A is not referenced.  If UPLO = 'L', the leading n by n lower
 *>          triangular part of the array A contains the lower triangular
 *>          matrix, and the strictly upper triangular part of A is not
 *>          referenced.  If DIAG = 'U', the diagonal elements of A are
 *>          also not referenced and are assumed to be 1.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max (1,N).
 *> \endverbatim
 *>
 *> \param[in,out] X
 *> \verbatim
 *>          X is COMPLEX array, dimension (LDX,NRHS)
 *>          On entry, the right hand side B of the triangular system.
 *>          On exit, X is overwritten by the solution matrix X.
 *> \endverbatim
 *>
 *> \param[in] LDX
 *> \verbatim
 *>          LDX is INTEGER
 *>          The leading dimension of the array X.  LDX >= max (1,N).
 *> \endverbatim
 *>
 *> \param[out] SCALE
 *> \verbatim
 *>          SCALE is REAL array, dimension (NRHS)
 *>          The scaling factor s(k) is for the triangular system
 *>          A * x(:,k) = s(k)*b(:,k)  or  A**T* x(:,k) = s(k)*b(:,k).
 *>          If SCALE = 0, the matrix A is singular or badly scaled.
 *>          If A(j,j) = 0 is encountered, a non-trivial vector x(:,k)
 *>          that is an exact or approximate solution to A*x(:,k) = 0
 *>          is returned. If the system so badly scaled that solution
 *>          cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0
 *>          is returned.
 *> \endverbatim
 *>
 *> \param[in,out] CNORM
 *> \verbatim
 *>          CNORM is REAL array, dimension (N)
 *>
 *>          If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
 *>          contains the norm of the off-diagonal part of the j-th column
 *>          of A.  If TRANS = 'N', CNORM(j) must be greater than or equal
 *>          to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
 *>          must be greater than or equal to the 1-norm.
 *>
 *>          If NORMIN = 'N', CNORM is an output argument and CNORM(j)
 *>          returns the 1-norm of the offdiagonal part of the j-th column
 *>          of A.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (LWORK).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal size of
 *>          WORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *>          LWORK is INTEGER
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -k, the k-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERauxiliary
 *> \par Further Details:
 *  =====================
 *  \verbatim
 *  The algorithm follows the structure of a block triangular solve.
 *  The diagonal block is solved with a call to the robust the triangular
 *  solver LATRS for every right-hand side RHS = 1, ..., NRHS
 *     op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ),
 *  where op( A ) = A or op( A ) = A**T or op( A ) = A**H.
 *  The linear block updates operate on block columns of X,
 *     B( I, K ) - op(A( I, J )) * X( J, K )
 *  and use GEMM. To avoid overflow in the linear block update, the worst case
 *  growth is estimated. For every RHS, a scale factor s <= 1.0 is computed
 *  such that
 *     || s * B( I, RHS )||_oo
 *   + || op(A( I, J )) ||_oo * || s *  X( J, RHS ) ||_oo <= Overflow threshold
 *
 *  Once all columns of a block column have been rescaled (BLAS-1), the linear
 *  update is executed with GEMM without overflow.
 *
 *  To limit rescaling, local scale factors track the scaling of column segments.
 *  There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA
 *  per right-hand side column RHS = 1, ..., NRHS. The global scale factor
 *  SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS )
 *  I = 1, ..., NBA.
 *  A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS )
 *  updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The
 *  linear update of potentially inconsistently scaled vector segments
 *     s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) )
 *  computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and,
 *  if necessary, rescales the blocks prior to calling GEMM.
 *
 *  \endverbatim
 *  =====================================================================
 *  References:
 *  C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019).
 *  Parallel robust solution of triangular linear systems. Concurrency
 *  and Computation: Practice and Experience, 31(19), e5064.
 *
 *  Contributor:
 *   Angelika Schwarz, Umea University, Sweden.
 *
 *  =====================================================================
      SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
     $                    X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
      IMPLICIT NONE
 *
 *     .. Scalar Arguments ..
      CHARACTER          DIAG, TRANS, NORMIN, UPLO
      INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
 *     ..
 *     .. Array Arguments ..
      COMPLEX            A( LDA, * ), X( LDX, * )
      REAL               CNORM( * ), SCALE( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      REAL               ZERO, ONE
      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
      COMPLEX            CZERO, CONE
      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ) )
      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
      INTEGER            NBMAX, NBMIN, NBRHS, NRHSMIN
      PARAMETER          ( NRHSMIN = 2, NBRHS = 32 )
      PARAMETER          ( NBMIN = 8, NBMAX = 64 )
 *     ..
 *     .. Local Arrays ..
      REAL               W( NBMAX ), XNRM( NBRHS )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
      INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
     $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
      REAL               ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
     $                   SCAMIN, SMLNUM, TMAX
 *     ..
 *     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV
      REAL               SLAMCH, CLANGE, SLARMM
      EXTERNAL           ILAENV, LSAME, SLAMCH, CLANGE, SLARMM
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CLATRS, CSSCAL, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          ABS, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
      INFO = 0
      UPPER = LSAME( UPLO, 'U' )
      NOTRAN = LSAME( TRANS, 'N' )
      NOUNIT = LSAME( DIAG, 'N' )
      LQUERY = ( LWORK.EQ.-1 )
 *
 *     Partition A and X into blocks.
 *
      NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) )
      NB = MIN( NBMAX, NB )
      NBA = MAX( 1, (N + NB - 1) / NB )
      NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS )
 *
 *     Compute the workspace
 *
 *     The workspace comprises two parts.
 *     The first part stores the local scale factors. Each simultaneously
 *     computed right-hand side requires one local scale factor per block
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
      LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
      LDS = NBA
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
      LANRM = NBA * NBA
      AWRK = LSCALE
      WORK( 1 ) = LSCALE + LANRM
 *
 *     Test the input parameters.
 *
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
     $         LSAME( TRANS, 'C' ) ) THEN
         INFO = -2
      ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
         INFO = -3
      ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT.
     $         LSAME( NORMIN, 'N' ) ) THEN
         INFO = -4
      ELSE IF( N.LT.0 ) THEN
         INFO = -5
      ELSE IF( NRHS.LT.0 ) THEN
         INFO = -6
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
         INFO = -8
      ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
         INFO = -10
      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
         INFO = -14
      END IF
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLATRS3', -INFO )
         RETURN
      ELSE IF( LQUERY ) THEN
         RETURN
      END IF
 *
 *     Initialize scaling factors
 *
      DO KK = 1, NRHS
         SCALE( KK ) = ONE
      END DO
 *
 *     Quick return if possible
 *
      IF( MIN( N, NRHS ).EQ.0 )
     $   RETURN
 *
 *     Determine machine dependent constant to control overflow.
 *
      BIGNUM = SLAMCH( 'Overflow' )
      SMLNUM = SLAMCH( 'Safe Minimum' )
 *
 *     Use unblocked code for small problems
 *
      IF( NRHS.LT.NRHSMIN ) THEN
         CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ),
     $                SCALE( 1 ), CNORM, INFO )
         DO K = 2, NRHS
            CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ),
     $                   SCALE( K ), CNORM, INFO )
         END DO
         RETURN
      END IF
 *
 *     Compute norms of blocks of A excluding diagonal blocks and find
 *     the block with the largest norm TMAX.
 *
      TMAX = ZERO
      DO J = 1, NBA
         J1 = (J-1)*NB + 1
         J2 = MIN( J*NB, N ) + 1
         IF ( UPPER ) THEN
            IFIRST = 1
            ILAST = J - 1
         ELSE
            IFIRST = J + 1
            ILAST = NBA
         END IF
         DO I = IFIRST, ILAST
            I1 = (I-1)*NB + 1
            I2 = MIN( I*NB, N ) + 1
 *
 *           Compute upper bound of A( I1:I2-1, J1:J2-1 ).
 *
            IF( NOTRAN ) THEN
               ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
               WORK( AWRK + I+(J-1)*NBA ) = ANRM
            ELSE
               ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
               WORK( AWRK + J+(I-1)*NBA ) = ANRM
            END IF
            TMAX = MAX( TMAX, ANRM )
         END DO
      END DO
 *
      IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN
 *
 *        Some matrix entries have huge absolute value. At least one upper
 *        bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
 *        number, either due to overflow in LANGE or due to Inf in A.
 *        Fall back to LATRS. Set normin = 'N' for every right-hand side to
 *        force computation of TSCAL in LATRS to avoid the likely overflow
 *        in the computation of the column norms CNORM.
 *
         DO K = 1, NRHS
            CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ),
     $                   SCALE( K ), CNORM, INFO )
         END DO
         RETURN
      END IF
 *
 *     Every right-hand side requires workspace to store NBA local scale
 *     factors. To save workspace, X is computed successively in block columns
 *     of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
 *     workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
      DO K = 1, NBX
 *        Loop over block columns (index = K) of X and, for column-wise scalings,
 *        over individual columns (index = KK).
 *        K1: column index of the first column in X( J, K )
 *        K2: column index of the first column in X( J, K+1 )
 *        so the K2 - K1 is the column count of the block X( J, K )
         K1 = (K-1)*NBRHS + 1
         K2 = MIN( K*NBRHS, NRHS ) + 1
 *
 *        Initialize local scaling factors of current block column X( J, K )
 *
         DO KK = 1, K2-K1
            DO I = 1, NBA
               WORK( I+KK*LDS ) = ONE
            END DO
         END DO
 *
         IF( NOTRAN ) THEN
 *
 *           Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
 *
            IF( UPPER ) THEN
               JFIRST = NBA
               JLAST = 1
               JINC = -1
            ELSE
               JFIRST = 1
               JLAST = NBA
               JINC = 1
            END IF
         ELSE
 *
 *           Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
 *           where op(A) = A**T or op(A) = A**H
 *
            IF( UPPER ) THEN
               JFIRST = 1
               JLAST = NBA
               JINC = 1
            ELSE
               JFIRST = NBA
               JLAST = 1
               JINC = -1
            END IF
         END IF

         DO J = JFIRST, JLAST, JINC
 *           J1: row index of the first row in A( J, J )
 *           J2: row index of the first row in A( J+1, J+1 )
 *           so that J2 - J1 is the row count of the block A( J, J )
            J1 = (J-1)*NB + 1
            J2 = MIN( J*NB, N ) + 1
 *
 *           Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
 *
            DO KK = 1, K2-K1
               RHS = K1 + KK - 1
               IF( KK.EQ.1 ) THEN
                  CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1,
     $                         A( J1, J1 ), LDA, X( J1, RHS ),
     $                         SCALOC, CNORM, INFO )
               ELSE
                  CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1,
     $                         A( J1, J1 ), LDA, X( J1, RHS ),
     $                         SCALOC, CNORM, INFO )
               END IF
 *              Find largest absolute value entry in the vector segment
 *              X( J1:J2-1, RHS ) as an upper bound for the worst case
 *              growth in the linear updates.
               XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ),
     $                              LDX, W )
 *
               IF( SCALOC .EQ. ZERO ) THEN
 *                 LATRS found that A is singular through A(j,j) = 0.
 *                 Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
 *                 and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is
 *                 set by LATRS.
                  SCALE( RHS ) = ZERO
                  DO II = 1, J1-1
                     X( II, KK ) = CZERO
                  END DO
                  DO II = J2, N
                     X( II, KK ) = CZERO
                  END DO
 *                 Discard the local scale factors.
                  DO II = 1, NBA
                     WORK( II+KK*LDS ) = ONE
                  END DO
                  SCALOC = ONE
               ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN
 *                 LATRS computed a valid scale factor, but combined with
 *                 the current scaling the solution does not have a
 *                 scale factor > 0.
 *
 *                 Set WORK( J+KK*LDS ) to smallest valid scale
 *                 factor and increase SCALOC accordingly.
                  SCAL = WORK( J+KK*LDS ) / SMLNUM
                  SCALOC = SCALOC * SCAL
                  WORK( J+KK*LDS ) = SMLNUM
 *                 If LATRS overestimated the growth, x may be
 *                 rescaled to preserve a valid combined scale
 *                 factor WORK( J, KK ) > 0.
                  RSCAL = ONE / SCALOC
                  IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN
                     XNRM( KK ) = XNRM( KK ) * RSCAL
                     CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 )
                     SCALOC = ONE
                  ELSE
 *                    The system op(A) * x = b is badly scaled and its
 *                    solution cannot be represented as (1/scale) * x.
 *                    Set x to zero. This approach deviates from LATRS
 *                    where a completely meaningless non-zero vector
 *                    is returned that is not a solution to op(A) * x = b.
                     SCALE( RHS ) = ZERO
                     DO II = 1, N
                        X( II, KK ) = CZERO
                     END DO
 *                    Discard the local scale factors.
                     DO II = 1, NBA
                        WORK( II+KK*LDS ) = ONE
                     END DO
                     SCALOC = ONE
                  END IF
               END IF
               SCALOC = SCALOC * WORK( J+KK*LDS )
               WORK( J+KK*LDS ) = SCALOC
            END DO
 *
 *           Linear block updates
 *
            IF( NOTRAN ) THEN
               IF( UPPER ) THEN
                  IFIRST = J - 1
                  ILAST = 1
                  IINC = -1
               ELSE
                  IFIRST = J + 1
                  ILAST = NBA
                  IINC = 1
               END IF
            ELSE
               IF( UPPER ) THEN
                  IFIRST = J + 1
                  ILAST = NBA
                  IINC = 1
               ELSE
                  IFIRST = J - 1
                  ILAST = 1
                  IINC = -1
               END IF
            END IF
 *
            DO I = IFIRST, ILAST, IINC
 *              I1: row index of the first column in X( I, K )
 *              I2: row index of the first column in X( I+1, K )
 *              so the I2 - I1 is the row count of the block X( I, K )
               I1 = (I-1)*NB + 1
               I2 = MIN( I*NB, N ) + 1
 *
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
 *              rescale the column segments, if necesssary. Then
 *              the linear update is safely executed.
 *
               DO KK = 1, K2-K1
                  RHS = K1 + KK - 1
 *                 Compute consistent scaling
                  SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) )
 *
 *                 Compute scaling factor to survive the linear update
 *                 simulating consistent scaling.
 *
                  BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W )
                  BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) )
                  XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) )
                  ANRM = WORK( AWRK + I+(J-1)*NBA )
                  SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM )
 *
 *                 Simultaneously apply the robust update factor and the
 *                 consistency scaling factor to X( I, KK ) and X( J, KK ).
 *
                  SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC
                  IF( SCAL.NE.ONE ) THEN
                     CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
                     WORK( I+KK*LDS ) = SCAMIN*SCALOC
                  END IF
 *
                  SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC
                  IF( SCAL.NE.ONE ) THEN
                     CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 )
                     WORK( J+KK*LDS ) = SCAMIN*SCALOC
                  END IF
               END DO
 *
               IF( NOTRAN ) THEN
 *
 *                 B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
 *
                  CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE,
     $                        A( I1, J1 ), LDA, X( J1, K1 ), LDX,
     $                        CONE, X( I1, K1 ), LDX )
               ELSE IF( LSAME( TRANS, 'T' ) ) THEN
 *
 *                 B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K )
 *
                  CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE,
     $                        A( J1, I1 ), LDA, X( J1, K1 ), LDX,
     $                        CONE, X( I1, K1 ), LDX )
               ELSE
 *
 *                 B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K )
 *
                  CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE,
     $                        A( J1, I1 ), LDA, X( J1, K1 ), LDX,
     $                        CONE, X( I1, K1 ), LDX )
               END IF
            END DO
         END DO
 *
 *        Reduce local scaling factors
 *
         DO KK = 1, K2-K1
            RHS = K1 + KK - 1
            DO I = 1, NBA
               SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) )
            END DO
         END DO
 *
 *        Realize consistent scaling
 *
         DO KK = 1, K2-K1
            RHS = K1 + KK - 1
            IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN
               DO I = 1, NBA
                  I1 = (I-1)*NB + 1
                  I2 = MIN( I*NB, N ) + 1
                  SCAL = SCALE( RHS ) / WORK( I+KK*LDS )
                  IF( SCAL.NE.ONE )
     $               CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
               END DO
            END IF
         END DO
      END DO
      RETURN
 *
 *     End of CLATRS3
 *
      END
--- a/lapack-netlib/SRC/ctrsyl3.f
+++ b/lapack-netlib/SRC/ctrsyl3.f
--- a/lapack-netlib/SRC/dlarmm.f
+++ b/lapack-netlib/SRC/dlarmm.f
@@ -0,0 +1,99 @@
 *> \brief \b DLARMM
 *
 * Definition:
 * ===========
 *
 *      DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM )
 *
 *     .. Scalar Arguments ..
 *      DOUBLE PRECISION   ANORM, BNORM, CNORM
 *     ..
 *
 *>  \par Purpose:
 *  =======
 *>
 *> \verbatim
 *>
 *> DLARMM returns a factor s in (0, 1] such that the linear updates
 *>
 *>    (s * C) - A * (s * B)  and  (s * C) - (s * A) * B
 *>
 *> cannot overflow, where A, B, and C are matrices of conforming
 *> dimensions.
 *>
 *> This is an auxiliary routine so there is no argument checking.
 *> \endverbatim
 *
 *  Arguments:
 *  =========
 *
 *> \param[in] ANORM
 *> \verbatim
 *>          ANORM is DOUBLE PRECISION
 *>          The infinity norm of A. ANORM >= 0.
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] BNORM
 *> \verbatim
 *>          BNORM is DOUBLE PRECISION
 *>          The infinity norm of B. BNORM >= 0.
 *> \endverbatim
 *>
 *> \param[in] CNORM
 *> \verbatim
 *>          CNORM is DOUBLE PRECISION
 *>          The infinity norm of C. CNORM >= 0.
 *> \endverbatim
 *>
 *>
 *  =====================================================================
 *>  References:
 *>    C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for
 *>    Robust Solution of Triangular Linear Systems. In: International
 *>    Conference on Parallel Processing and Applied Mathematics, pages
 *>    68--78. Springer, 2017.
 *>
 *> \ingroup OTHERauxiliary
 *  =====================================================================

      DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM )
      IMPLICIT NONE
 *     .. Scalar Arguments ..
      DOUBLE PRECISION   ANORM, BNORM, CNORM
 *     .. Parameters ..
      DOUBLE PRECISION   ONE, HALF, FOUR
      PARAMETER          ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 )
 *     ..
 *     .. Local Scalars ..
       DOUBLE PRECISION   BIGNUM, SMLNUM
 *     ..
 *     .. External Functions ..
      DOUBLE PRECISION   DLAMCH
      EXTERNAL           DLAMCH
 *     ..
 *     .. Executable Statements ..
 *
 *
 *     Determine machine dependent parameters to control overflow.
 *
      SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' )
      BIGNUM = ( ONE / SMLNUM ) / FOUR
 *
 *     Compute a scale factor.
 *
      DLARMM = ONE
      IF( BNORM .LE. ONE ) THEN
         IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN
            DLARMM = HALF
         END IF
      ELSE
         IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN
            DLARMM = HALF / BNORM
         END IF
      END IF
      RETURN
 *
 *     ==== End of DLARMM ====
 *
      END
--- a/lapack-netlib/SRC/dlatrs3.f
+++ b/lapack-netlib/SRC/dlatrs3.f
@@ -0,0 +1,656 @@
 *> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.
 *
 *  Definition:
 *  ===========
 *
 *      SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
 *                          X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          DIAG, NORMIN, TRANS, UPLO
 *       INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
 *       ..
 *       .. Array Arguments ..
 *       DOUBLE PRECISION   A( LDA, * ), CNORM( * ), SCALE( * ), 
 *                          WORK( * ), X( LDX, * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> DLATRS3 solves one of the triangular systems
 *>
 *>    A * X = B * diag(scale)  or  A**T * X = B * diag(scale)
 *>
 *> with scaling to prevent overflow.  Here A is an upper or lower
 *> triangular matrix, A**T denotes the transpose of A. X and B are
 *> n by nrhs matrices and scale is an nrhs element vector of scaling
 *> factors. A scaling factor scale(j) is usually less than or equal
 *> to 1, chosen such that X(:,j) is less than the overflow threshold.
 *> If the matrix A is singular (A(j,j) = 0 for some j), then
 *> a non-trivial solution to A*X = 0 is returned. If the system is
 *> so badly scaled that the solution cannot be represented as
 *> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
 *>
 *> This is a BLAS-3 version of LATRS for solving several right
 *> hand sides simultaneously.
 *>
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] UPLO
 *> \verbatim
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the matrix A is upper or lower triangular.
 *>          = 'U':  Upper triangular
 *>          = 'L':  Lower triangular
 *> \endverbatim
 *>
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
 *>          Specifies the operation applied to A.
 *>          = 'N':  Solve A * x = s*b  (No transpose)
 *>          = 'T':  Solve A**T* x = s*b  (Transpose)
 *>          = 'C':  Solve A**T* x = s*b  (Conjugate transpose = Transpose)
 *> \endverbatim
 *>
 *> \param[in] DIAG
 *> \verbatim
 *>          DIAG is CHARACTER*1
 *>          Specifies whether or not the matrix A is unit triangular.
 *>          = 'N':  Non-unit triangular
 *>          = 'U':  Unit triangular
 *> \endverbatim
 *>
 *> \param[in] NORMIN
 *> \verbatim
 *>          NORMIN is CHARACTER*1
 *>          Specifies whether CNORM has been set or not.
 *>          = 'Y':  CNORM contains the column norms on entry
 *>          = 'N':  CNORM is not set on entry.  On exit, the norms will
 *>                  be computed and stored in CNORM.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] NRHS
 *> \verbatim
 *>          NRHS is INTEGER
 *>          The number of columns of X.  NRHS >= 0.
 *> \endverbatim
 *>
 *> \param[in] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>          The triangular matrix A.  If UPLO = 'U', the leading n by n
 *>          upper triangular part of the array A contains the upper
 *>          triangular matrix, and the strictly lower triangular part of
 *>          A is not referenced.  If UPLO = 'L', the leading n by n lower
 *>          triangular part of the array A contains the lower triangular
 *>          matrix, and the strictly upper triangular part of A is not
 *>          referenced.  If DIAG = 'U', the diagonal elements of A are
 *>          also not referenced and are assumed to be 1.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max (1,N).
 *> \endverbatim
 *>
 *> \param[in,out] X
 *> \verbatim
 *>          X is DOUBLE PRECISION array, dimension (LDX,NRHS)
 *>          On entry, the right hand side B of the triangular system.
 *>          On exit, X is overwritten by the solution matrix X.
 *> \endverbatim
 *>
 *> \param[in] LDX
 *> \verbatim
 *>          LDX is INTEGER
 *>          The leading dimension of the array X.  LDX >= max (1,N).
 *> \endverbatim
 *>
 *> \param[out] SCALE
 *> \verbatim
 *>          SCALE is DOUBLE PRECISION array, dimension (NRHS)
 *>          The scaling factor s(k) is for the triangular system
 *>          A * x(:,k) = s(k)*b(:,k)  or  A**T* x(:,k) = s(k)*b(:,k).
 *>          If SCALE = 0, the matrix A is singular or badly scaled.
 *>          If A(j,j) = 0 is encountered, a non-trivial vector x(:,k)
 *>          that is an exact or approximate solution to A*x(:,k) = 0
 *>          is returned. If the system so badly scaled that solution
 *>          cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0
 *>          is returned.
 *> \endverbatim
 *>
 *> \param[in,out] CNORM
 *> \verbatim
 *>          CNORM is DOUBLE PRECISION array, dimension (N)
 *>
 *>          If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
 *>          contains the norm of the off-diagonal part of the j-th column
 *>          of A.  If TRANS = 'N', CNORM(j) must be greater than or equal
 *>          to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
 *>          must be greater than or equal to the 1-norm.
 *>
 *>          If NORMIN = 'N', CNORM is an output argument and CNORM(j)
 *>          returns the 1-norm of the offdiagonal part of the j-th column
 *>          of A.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (LWORK).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal size of
 *>          WORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *>          LWORK is INTEGER
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -k, the k-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERauxiliary
 *> \par Further Details:
 *  =====================
 *  \verbatim
 *  The algorithm follows the structure of a block triangular solve.
 *  The diagonal block is solved with a call to the robust the triangular
 *  solver LATRS for every right-hand side RHS = 1, ..., NRHS
 *     op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ),
 *  where op( A ) = A or op( A ) = A**T.
 *  The linear block updates operate on block columns of X,
 *     B( I, K ) - op(A( I, J )) * X( J, K )
 *  and use GEMM. To avoid overflow in the linear block update, the worst case
 *  growth is estimated. For every RHS, a scale factor s <= 1.0 is computed
 *  such that
 *     || s * B( I, RHS )||_oo
 *   + || op(A( I, J )) ||_oo * || s *  X( J, RHS ) ||_oo <= Overflow threshold
 *
 *  Once all columns of a block column have been rescaled (BLAS-1), the linear
 *  update is executed with GEMM without overflow.
 *
 *  To limit rescaling, local scale factors track the scaling of column segments.
 *  There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA
 *  per right-hand side column RHS = 1, ..., NRHS. The global scale factor
 *  SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS )
 *  I = 1, ..., NBA.
 *  A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS )
 *  updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The
 *  linear update of potentially inconsistently scaled vector segments
 *     s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) )
 *  computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and,
 *  if necessary, rescales the blocks prior to calling GEMM.
 *
 *  \endverbatim
 *  =====================================================================
 *  References:
 *  C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019).
 *  Parallel robust solution of triangular linear systems. Concurrency
 *  and Computation: Practice and Experience, 31(19), e5064.
 *
 *  Contributor:
 *   Angelika Schwarz, Umea University, Sweden.
 *
 *  =====================================================================
      SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
     $                    X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
      IMPLICIT NONE
 *
 *     .. Scalar Arguments ..
      CHARACTER          DIAG, TRANS, NORMIN, UPLO
      INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
 *     ..
 *     .. Array Arguments ..
      DOUBLE PRECISION   A( LDA, * ), CNORM( * ), X( LDX, * ),
     $                   SCALE( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ZERO, ONE
      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
      INTEGER            NBMAX, NBMIN, NBRHS, NRHSMIN
      PARAMETER          ( NRHSMIN = 2, NBRHS = 32 )
      PARAMETER          ( NBMIN = 8, NBMAX = 64 )
 *     ..
 *     .. Local Arrays ..
      DOUBLE PRECISION   W( NBMAX ), XNRM( NBRHS )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
      INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
     $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
      DOUBLE PRECISION   ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
     $                   SCAMIN, SMLNUM, TMAX
 *     ..
 *     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV
      DOUBLE PRECISION   DLAMCH, DLANGE, DLARMM
      EXTERNAL           DLAMCH, DLANGE, DLARMM, ILAENV, LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DLATRS, DSCAL, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          ABS, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
      INFO = 0
      UPPER = LSAME( UPLO, 'U' )
      NOTRAN = LSAME( TRANS, 'N' )
      NOUNIT = LSAME( DIAG, 'N' )
      LQUERY = ( LWORK.EQ.-1 )
 *
 *     Partition A and X into blocks
 *
      NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) )
      NB = MIN( NBMAX, NB )
      NBA = MAX( 1, (N + NB - 1) / NB )
      NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS )
 *
 *     Compute the workspace
 *
 *     The workspace comprises two parts.
 *     The first part stores the local scale factors. Each simultaneously
 *     computed right-hand side requires one local scale factor per block
 *     row. WORK( I+KK*LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
      LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
      LDS = NBA
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
      LANRM = NBA * NBA
      AWRK = LSCALE
      WORK( 1 ) = LSCALE + LANRM
 *
 *     Test the input parameters
 *
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
     $         LSAME( TRANS, 'C' ) ) THEN
         INFO = -2
      ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
         INFO = -3
      ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT.
     $         LSAME( NORMIN, 'N' ) ) THEN
         INFO = -4
      ELSE IF( N.LT.0 ) THEN
         INFO = -5
      ELSE IF( NRHS.LT.0 ) THEN
         INFO = -6
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
         INFO = -8
      ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
         INFO = -10
      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
         INFO = -14
      END IF
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLATRS3', -INFO )
         RETURN
      ELSE IF( LQUERY ) THEN
         RETURN
      END IF
 *
 *     Initialize scaling factors
 *
      DO KK = 1, NRHS
         SCALE( KK ) = ONE
      END DO
 *
 *     Quick return if possible
 *
      IF( MIN( N, NRHS ).EQ.0 )
     $   RETURN
 *
 *     Determine machine dependent constant to control overflow.
 *
      BIGNUM = DLAMCH( 'Overflow' )
      SMLNUM = DLAMCH( 'Safe Minimum' )
 *
 *     Use unblocked code for small problems
 *
      IF( NRHS.LT.NRHSMIN ) THEN
         CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1),
     $                SCALE( 1 ), CNORM, INFO )
         DO K = 2, NRHS
            CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ),
     $                   SCALE( K ), CNORM, INFO )
         END DO
         RETURN
      END IF
 *
 *     Compute norms of blocks of A excluding diagonal blocks and find
 *     the block with the largest norm TMAX.
 *
      TMAX = ZERO
      DO J = 1, NBA
         J1 = (J-1)*NB + 1
         J2 = MIN( J*NB, N ) + 1
         IF ( UPPER ) THEN
            IFIRST = 1
            ILAST = J - 1
         ELSE
            IFIRST = J + 1
            ILAST = NBA
         END IF
         DO I = IFIRST, ILAST
            I1 = (I-1)*NB + 1
            I2 = MIN( I*NB, N ) + 1
 *
 *           Compute upper bound of A( I1:I2-1, J1:J2-1 ).
 *
            IF( NOTRAN ) THEN
               ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
               WORK( AWRK + I+(J-1)*NBA ) = ANRM
            ELSE
               ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
               WORK( AWRK + J+(I-1)*NBA ) = ANRM
            END IF
            TMAX = MAX( TMAX, ANRM )
         END DO
      END DO
 *
      IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN
 *
 *        Some matrix entries have huge absolute value. At least one upper
 *        bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
 *        number, either due to overflow in LANGE or due to Inf in A.
 *        Fall back to LATRS. Set normin = 'N' for every right-hand side to
 *        force computation of TSCAL in LATRS to avoid the likely overflow
 *        in the computation of the column norms CNORM.
 *
         DO K = 1, NRHS
            CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ),
     $                   SCALE( K ), CNORM, INFO )
         END DO
         RETURN
      END IF
 *
 *     Every right-hand side requires workspace to store NBA local scale
 *     factors. To save workspace, X is computed successively in block columns
 *     of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
 *     workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
      DO K = 1, NBX
 *        Loop over block columns (index = K) of X and, for column-wise scalings,
 *        over individual columns (index = KK).
 *        K1: column index of the first column in X( J, K )
 *        K2: column index of the first column in X( J, K+1 )
 *        so the K2 - K1 is the column count of the block X( J, K )
         K1 = (K-1)*NBRHS + 1
         K2 = MIN( K*NBRHS, NRHS ) + 1
 *
 *        Initialize local scaling factors of current block column X( J, K )
 *
         DO KK = 1, K2-K1
            DO I = 1, NBA
               WORK( I+KK*LDS ) = ONE
            END DO
         END DO
 *
         IF( NOTRAN ) THEN
 *
 *           Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
 *
            IF( UPPER ) THEN
               JFIRST = NBA
               JLAST = 1
               JINC = -1
            ELSE
               JFIRST = 1
               JLAST = NBA
               JINC = 1
            END IF
         ELSE
 *
 *           Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
 *
            IF( UPPER ) THEN
               JFIRST = 1
               JLAST = NBA
               JINC = 1
            ELSE
               JFIRST = NBA
               JLAST = 1
               JINC = -1
            END IF
         END IF
 *
         DO J = JFIRST, JLAST, JINC
 *           J1: row index of the first row in A( J, J )
 *           J2: row index of the first row in A( J+1, J+1 )
 *           so that J2 - J1 is the row count of the block A( J, J )
            J1 = (J-1)*NB + 1
            J2 = MIN( J*NB, N ) + 1
 *
 *           Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
 *           for all right-hand sides in the current block column,
 *           one RHS at a time.
 *
            DO KK = 1, K2-K1
               RHS = K1 + KK - 1
               IF( KK.EQ.1 ) THEN
                  CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1,
     $                         A( J1, J1 ), LDA, X( J1, RHS ),
     $                         SCALOC, CNORM, INFO )
               ELSE
                  CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1,
     $                         A( J1, J1 ), LDA, X( J1, RHS ),
     $                         SCALOC, CNORM, INFO )
               END IF
 *              Find largest absolute value entry in the vector segment
 *              X( J1:J2-1, RHS ) as an upper bound for the worst case
 *              growth in the linear updates.
               XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ),
     $                              LDX, W )
 *
               IF( SCALOC .EQ. ZERO ) THEN
 *                 LATRS found that A is singular through A(j,j) = 0.
 *                 Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
 *                 and compute A*x = 0 (or A**T*x = 0). Note that
 *                 X(J1:J2-1, KK) is set by LATRS.
                  SCALE( RHS ) = ZERO
                  DO II = 1, J1-1
                     X( II, KK ) = ZERO
                  END DO
                  DO II = J2, N
                     X( II, KK ) = ZERO
                  END DO
 *                 Discard the local scale factors.
                  DO II = 1, NBA
                     WORK( II+KK*LDS ) = ONE
                  END DO
                  SCALOC = ONE
               ELSE IF( SCALOC * WORK( J+KK*LDS ) .EQ. ZERO ) THEN
 *                 LATRS computed a valid scale factor, but combined with
 *                 the current scaling the solution does not have a
 *                 scale factor > 0.
 *
 *                 Set WORK( J+KK*LDS ) to smallest valid scale
 *                 factor and increase SCALOC accordingly.
                  SCAL = WORK( J+KK*LDS ) / SMLNUM
                  SCALOC = SCALOC * SCAL
                  WORK( J+KK*LDS ) = SMLNUM
 *                 If LATRS overestimated the growth, x may be
 *                 rescaled to preserve a valid combined scale
 *                 factor WORK( J, KK ) > 0.
                  RSCAL = ONE / SCALOC
                  IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN
                     XNRM( KK ) = XNRM( KK ) * RSCAL
                     CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 )
                     SCALOC = ONE
                  ELSE
 *                    The system op(A) * x = b is badly scaled and its
 *                    solution cannot be represented as (1/scale) * x.
 *                    Set x to zero. This approach deviates from LATRS
 *                    where a completely meaningless non-zero vector
 *                    is returned that is not a solution to op(A) * x = b.
                     SCALE( RHS ) = ZERO
                     DO II = 1, N
                        X( II, KK ) = ZERO
                     END DO
 *                    Discard the local scale factors.
                     DO II = 1, NBA
                        WORK( II+KK*LDS ) = ONE
                     END DO
                     SCALOC = ONE
                  END IF
               END IF
               SCALOC = SCALOC * WORK( J+KK*LDS )
               WORK( J+KK*LDS ) = SCALOC
            END DO
 *
 *           Linear block updates
 *
            IF( NOTRAN ) THEN
               IF( UPPER ) THEN
                  IFIRST = J - 1
                  ILAST = 1
                  IINC = -1
               ELSE
                  IFIRST = J + 1
                  ILAST = NBA
                  IINC = 1
               END IF
            ELSE
               IF( UPPER ) THEN
                  IFIRST = J + 1
                  ILAST = NBA
                  IINC = 1
               ELSE
                  IFIRST = J - 1
                  ILAST = 1
                  IINC = -1
               END IF
            END IF
 *
            DO I = IFIRST, ILAST, IINC
 *              I1: row index of the first column in X( I, K )
 *              I2: row index of the first column in X( I+1, K )
 *              so the I2 - I1 is the row count of the block X( I, K )
               I1 = (I-1)*NB + 1
               I2 = MIN( I*NB, N ) + 1
 *
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
 *              rescale the column segments, if necesssary. Then
 *              the linear update is safely executed.
 *
               DO KK = 1, K2-K1
                  RHS = K1 + KK - 1
 *                 Compute consistent scaling
                  SCAMIN = MIN( WORK( I + KK*LDS), WORK( J + KK*LDS ) )
 *
 *                 Compute scaling factor to survive the linear update
 *                 simulating consistent scaling.
 *
                  BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W )
                  BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) )
                  XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS ))
                  ANRM = WORK( AWRK + I+(J-1)*NBA )
                  SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM )
 *
 *                 Simultaneously apply the robust update factor and the
 *                 consistency scaling factor to B( I, KK ) and B( J, KK ).
 *
                  SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC
                  IF( SCAL.NE.ONE ) THEN
                     CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
                     WORK( I+KK*LDS ) = SCAMIN*SCALOC
                  END IF
 *
                  SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC
                  IF( SCAL.NE.ONE ) THEN
                     CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 )
                     WORK( J+KK*LDS ) = SCAMIN*SCALOC
                  END IF
               END DO
 *
               IF( NOTRAN ) THEN
 *
 *                 B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
 *
                  CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE,
     $                        A( I1, J1 ), LDA, X( J1, K1 ), LDX,
     $                        ONE, X( I1, K1 ), LDX )
               ELSE
 *
 *                 B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K )
 *
                  CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE,
     $                        A( J1, I1 ), LDA, X( J1, K1 ), LDX,
     $                        ONE, X( I1, K1 ), LDX )
               END IF
            END DO
         END DO
 *
 *        Reduce local scaling factors
 *
         DO KK = 1, K2-K1
            RHS = K1 + KK - 1
            DO I = 1, NBA
               SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) )
            END DO
         END DO
 *
 *        Realize consistent scaling
 *
         DO KK = 1, K2-K1
            RHS = K1 + KK - 1
            IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN
               DO I = 1, NBA
                  I1 = (I-1)*NB + 1
                  I2 = MIN( I*NB, N ) + 1
                  SCAL = SCALE( RHS ) / WORK( I+KK*LDS )
                  IF( SCAL.NE.ONE )
     $               CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
               END DO
            END IF
         END DO
      END DO
      RETURN
 *
 *     End of DLATRS3
 *
      END
--- a/lapack-netlib/SRC/dtrsyl3.f
+++ b/lapack-netlib/SRC/dtrsyl3.f
--- a/lapack-netlib/SRC/ilaenv.f
+++ b/lapack-netlib/SRC/ilaenv.f
@@ -469,6 +469,15 @@
            ELSE
               NB = 64
            END IF
         ELSE IF( C3.EQ.'SYL' ) THEN
 *           The upper bound is to prevent overly aggressive scaling.
            IF( SNAME ) THEN
               NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ),
     $                   240 )
            ELSE
               NB = MIN( MAX( 24, INT( ( MIN( N1, N2 ) * 8 ) / 100) ),
     $                   80 )
            END IF
         END IF
      ELSE IF( C2.EQ.'LA' ) THEN
         IF( C3.EQ.'UUM' ) THEN
@@ -477,6 +486,12 @@
            ELSE
               NB = 64
            END IF
         ELSE IF( C3.EQ.'TRS' ) THEN
            IF( SNAME ) THEN
               NB = 32
            ELSE
               NB = 32
            END IF
         END IF
      ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN
         IF( C3.EQ.'EBZ' ) THEN
--- a/lapack-netlib/SRC/slarmm.f
+++ b/lapack-netlib/SRC/slarmm.f
@@ -0,0 +1,99 @@
 *> \brief \b SLARMM
 *
 * Definition:
 * ===========
 *
 *      REAL FUNCTION SLARMM( ANORM, BNORM, CNORM )
 *
 *     .. Scalar Arguments ..
 *      REAL               ANORM, BNORM, CNORM
 *     ..
 *
 *>  \par Purpose:
 *  =======
 *>
 *> \verbatim
 *>
 *> SLARMM returns a factor s in (0, 1] such that the linear updates
 *>
 *>    (s * C) - A * (s * B)  and  (s * C) - (s * A) * B
 *>
 *> cannot overflow, where A, B, and C are matrices of conforming
 *> dimensions.
 *>
 *> This is an auxiliary routine so there is no argument checking.
 *> \endverbatim
 *
 *  Arguments:
 *  =========
 *
 *> \param[in] ANORM
 *> \verbatim
 *>          ANORM is REAL
 *>          The infinity norm of A. ANORM >= 0.
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] BNORM
 *> \verbatim
 *>          BNORM is REAL
 *>          The infinity norm of B. BNORM >= 0.
 *> \endverbatim
 *>
 *> \param[in] CNORM
 *> \verbatim
 *>          CNORM is REAL
 *>          The infinity norm of C. CNORM >= 0.
 *> \endverbatim
 *>
 *>
 *  =====================================================================
 *>  References:
 *>    C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for
 *>    Robust Solution of Triangular Linear Systems. In: International
 *>    Conference on Parallel Processing and Applied Mathematics, pages
 *>    68--78. Springer, 2017.
 *>
 *> \ingroup OTHERauxiliary
 *  =====================================================================

      REAL FUNCTION SLARMM( ANORM, BNORM, CNORM )
      IMPLICIT NONE
 *     .. Scalar Arguments ..
      REAL               ANORM, BNORM, CNORM
 *     .. Parameters ..
      REAL               ONE, HALF, FOUR
      PARAMETER          ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 )
 *     ..
 *     .. Local Scalars ..
      REAL               BIGNUM, SMLNUM
 *     ..
 *     .. External Functions ..
      REAL               SLAMCH
      EXTERNAL           SLAMCH
 *     ..
 *     .. Executable Statements ..
 *
 *
 *     Determine machine dependent parameters to control overflow.
 *
      SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' )
      BIGNUM = ( ONE / SMLNUM ) / FOUR
 *
 *     Compute a scale factor.
 *
      SLARMM = ONE
      IF( BNORM .LE. ONE ) THEN
         IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN
            SLARMM = HALF
         END IF
      ELSE
         IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN
            SLARMM = HALF / BNORM
         END IF
      END IF
      RETURN
 *
 *     ==== End of SLARMM ====
 *
      END
--- a/lapack-netlib/SRC/slatrs3.f
+++ b/lapack-netlib/SRC/slatrs3.f
@@ -0,0 +1,656 @@
 *> \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.
 *
 *  Definition:
 *  ===========
 *
 *      SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
 *                          X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          DIAG, NORMIN, TRANS, UPLO
 *       INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
 *       ..
 *       .. Array Arguments ..
 *       REAL               A( LDA, * ), CNORM( * ), SCALE( * ), 
 *                          WORK( * ), X( LDX, * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> SLATRS3 solves one of the triangular systems
 *>
 *>    A * X = B * diag(scale)  or  A**T * X = B * diag(scale)
 *>
 *> with scaling to prevent overflow.  Here A is an upper or lower
 *> triangular matrix, A**T denotes the transpose of A. X and B are
 *> n by nrhs matrices and scale is an nrhs element vector of scaling
 *> factors. A scaling factor scale(j) is usually less than or equal
 *> to 1, chosen such that X(:,j) is less than the overflow threshold.
 *> If the matrix A is singular (A(j,j) = 0 for some j), then
 *> a non-trivial solution to A*X = 0 is returned. If the system is
 *> so badly scaled that the solution cannot be represented as
 *> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
 *>
 *> This is a BLAS-3 version of LATRS for solving several right
 *> hand sides simultaneously.
 *>
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] UPLO
 *> \verbatim
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the matrix A is upper or lower triangular.
 *>          = 'U':  Upper triangular
 *>          = 'L':  Lower triangular
 *> \endverbatim
 *>
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
 *>          Specifies the operation applied to A.
 *>          = 'N':  Solve A * x = s*b  (No transpose)
 *>          = 'T':  Solve A**T* x = s*b  (Transpose)
 *>          = 'C':  Solve A**T* x = s*b  (Conjugate transpose = Transpose)
 *> \endverbatim
 *>
 *> \param[in] DIAG
 *> \verbatim
 *>          DIAG is CHARACTER*1
 *>          Specifies whether or not the matrix A is unit triangular.
 *>          = 'N':  Non-unit triangular
 *>          = 'U':  Unit triangular
 *> \endverbatim
 *>
 *> \param[in] NORMIN
 *> \verbatim
 *>          NORMIN is CHARACTER*1
 *>          Specifies whether CNORM has been set or not.
 *>          = 'Y':  CNORM contains the column norms on entry
 *>          = 'N':  CNORM is not set on entry.  On exit, the norms will
 *>                  be computed and stored in CNORM.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] NRHS
 *> \verbatim
 *>          NRHS is INTEGER
 *>          The number of columns of X.  NRHS >= 0.
 *> \endverbatim
 *>
 *> \param[in] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>          The triangular matrix A.  If UPLO = 'U', the leading n by n
 *>          upper triangular part of the array A contains the upper
 *>          triangular matrix, and the strictly lower triangular part of
 *>          A is not referenced.  If UPLO = 'L', the leading n by n lower
 *>          triangular part of the array A contains the lower triangular
 *>          matrix, and the strictly upper triangular part of A is not
 *>          referenced.  If DIAG = 'U', the diagonal elements of A are
 *>          also not referenced and are assumed to be 1.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max (1,N).
 *> \endverbatim
 *>
 *> \param[in,out] X
 *> \verbatim
 *>          X is REAL array, dimension (LDX,NRHS)
 *>          On entry, the right hand side B of the triangular system.
 *>          On exit, X is overwritten by the solution matrix X.
 *> \endverbatim
 *>
 *> \param[in] LDX
 *> \verbatim
 *>          LDX is INTEGER
 *>          The leading dimension of the array X.  LDX >= max (1,N).
 *> \endverbatim
 *>
 *> \param[out] SCALE
 *> \verbatim
 *>          SCALE is REAL array, dimension (NRHS)
 *>          The scaling factor s(k) is for the triangular system
 *>          A * x(:,k) = s(k)*b(:,k)  or  A**T* x(:,k) = s(k)*b(:,k).
 *>          If SCALE = 0, the matrix A is singular or badly scaled.
 *>          If A(j,j) = 0 is encountered, a non-trivial vector x(:,k)
 *>          that is an exact or approximate solution to A*x(:,k) = 0
 *>          is returned. If the system so badly scaled that solution
 *>          cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0
 *>          is returned.
 *> \endverbatim
 *>
 *> \param[in,out] CNORM
 *> \verbatim
 *>          CNORM is REAL array, dimension (N)
 *>
 *>          If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
 *>          contains the norm of the off-diagonal part of the j-th column
 *>          of A.  If TRANS = 'N', CNORM(j) must be greater than or equal
 *>          to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
 *>          must be greater than or equal to the 1-norm.
 *>
 *>          If NORMIN = 'N', CNORM is an output argument and CNORM(j)
 *>          returns the 1-norm of the offdiagonal part of the j-th column
 *>          of A.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (LWORK).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal size of
 *>          WORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *>          LWORK is INTEGER
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -k, the k-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERauxiliary
 *> \par Further Details:
 *  =====================
 *  \verbatim
 *  The algorithm follows the structure of a block triangular solve.
 *  The diagonal block is solved with a call to the robust the triangular
 *  solver LATRS for every right-hand side RHS = 1, ..., NRHS
 *     op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ),
 *  where op( A ) = A or op( A ) = A**T.
 *  The linear block updates operate on block columns of X,
 *     B( I, K ) - op(A( I, J )) * X( J, K )
 *  and use GEMM. To avoid overflow in the linear block update, the worst case
 *  growth is estimated. For every RHS, a scale factor s <= 1.0 is computed
 *  such that
 *     || s * B( I, RHS )||_oo
 *   + || op(A( I, J )) ||_oo * || s *  X( J, RHS ) ||_oo <= Overflow threshold
 *
 *  Once all columns of a block column have been rescaled (BLAS-1), the linear
 *  update is executed with GEMM without overflow.
 *
 *  To limit rescaling, local scale factors track the scaling of column segments.
 *  There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA
 *  per right-hand side column RHS = 1, ..., NRHS. The global scale factor
 *  SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS )
 *  I = 1, ..., NBA.
 *  A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS )
 *  updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The
 *  linear update of potentially inconsistently scaled vector segments
 *     s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) )
 *  computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and,
 *  if necessary, rescales the blocks prior to calling GEMM.
 *
 *  \endverbatim
 *  =====================================================================
 *  References:
 *  C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019).
 *  Parallel robust solution of triangular linear systems. Concurrency
 *  and Computation: Practice and Experience, 31(19), e5064.
 *
 *  Contributor:
 *   Angelika Schwarz, Umea University, Sweden.
 *
 *  =====================================================================
      SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
     $                    X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
      IMPLICIT NONE
 *
 *     .. Scalar Arguments ..
      CHARACTER          DIAG, TRANS, NORMIN, UPLO
      INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
 *     ..
 *     .. Array Arguments ..
      REAL               A( LDA, * ), CNORM( * ), X( LDX, * ),
     $                   SCALE( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      REAL               ZERO, ONE
      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
      INTEGER            NBMAX, NBMIN, NBRHS, NRHSMIN
      PARAMETER          ( NRHSMIN = 2, NBRHS = 32 )
      PARAMETER          ( NBMIN = 8, NBMAX = 64 )
 *     ..
 *     .. Local Arrays ..
      REAL               W( NBMAX ), XNRM( NBRHS )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
      INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
     $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
      REAL               ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
     $                   SCAMIN, SMLNUM, TMAX
 *     ..
 *     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV
      REAL               SLAMCH, SLANGE, SLARMM
      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANGE, SLARMM
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SLATRS, SSCAL, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          ABS, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
      INFO = 0
      UPPER = LSAME( UPLO, 'U' )
      NOTRAN = LSAME( TRANS, 'N' )
      NOUNIT = LSAME( DIAG, 'N' )
      LQUERY = ( LWORK.EQ.-1 )
 *
 *     Partition A and X into blocks.
 *
      NB = MAX( 8, ILAENV( 1, 'SLATRS', '', N, N, -1, -1 ) )
      NB = MIN( NBMAX, NB )
      NBA = MAX( 1, (N + NB - 1) / NB )
      NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS )
 *
 *     Compute the workspace
 *
 *     The workspace comprises two parts.
 *     The first part stores the local scale factors. Each simultaneously
 *     computed right-hand side requires one local scale factor per block
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
      LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
      LDS = NBA
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
      LANRM = NBA * NBA
      AWRK = LSCALE
      WORK( 1 ) = LSCALE + LANRM
 *
 *     Test the input parameters.
 *
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
     $         LSAME( TRANS, 'C' ) ) THEN
         INFO = -2
      ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
         INFO = -3
      ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT.
     $         LSAME( NORMIN, 'N' ) ) THEN
         INFO = -4
      ELSE IF( N.LT.0 ) THEN
         INFO = -5
      ELSE IF( NRHS.LT.0 ) THEN
         INFO = -6
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
         INFO = -8
      ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
         INFO = -10
      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
         INFO = -14
      END IF
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLATRS3', -INFO )
         RETURN
      ELSE IF( LQUERY ) THEN
         RETURN
      END IF
 *
 *     Initialize scaling factors
 *
      DO KK = 1, NRHS
         SCALE( KK ) = ONE
      END DO
 *
 *     Quick return if possible
 *
      IF( MIN( N, NRHS ).EQ.0 )
     $   RETURN
 *
 *     Determine machine dependent constant to control overflow.
 *
      BIGNUM = SLAMCH( 'Overflow' )
      SMLNUM = SLAMCH( 'Safe Minimum' )
 *
 *     Use unblocked code for small problems
 *
      IF( NRHS.LT.NRHSMIN ) THEN
         CALL SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1),
     $                SCALE( 1 ), CNORM, INFO )
         DO K = 2, NRHS
            CALL SLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ),
     $                   SCALE( K ), CNORM, INFO )
         END DO
         RETURN
      END IF
 *
 *     Compute norms of blocks of A excluding diagonal blocks and find
 *     the block with the largest norm TMAX.
 *
      TMAX = ZERO
      DO J = 1, NBA
         J1 = (J-1)*NB + 1
         J2 = MIN( J*NB, N ) + 1
         IF ( UPPER ) THEN
            IFIRST = 1
            ILAST = J - 1
         ELSE
            IFIRST = J + 1
            ILAST = NBA
         END IF
         DO I = IFIRST, ILAST
            I1 = (I-1)*NB + 1
            I2 = MIN( I*NB, N ) + 1
 *
 *           Compute upper bound of A( I1:I2-1, J1:J2-1 ).
 *
            IF( NOTRAN ) THEN
               ANRM = SLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
               WORK( AWRK + I+(J-1)*NBA ) = ANRM
            ELSE
               ANRM = SLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
               WORK( AWRK + J+(I-1)*NBA ) = ANRM
            END IF
            TMAX = MAX( TMAX, ANRM )
         END DO
      END DO
 *
      IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN
 *
 *        Some matrix entries have huge absolute value. At least one upper
 *        bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
 *        number, either due to overflow in LANGE or due to Inf in A.
 *        Fall back to LATRS. Set normin = 'N' for every right-hand side to
 *        force computation of TSCAL in LATRS to avoid the likely overflow
 *        in the computation of the column norms CNORM.
 *
         DO K = 1, NRHS
            CALL SLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ),
     $                   SCALE( K ), CNORM, INFO )
         END DO
         RETURN
      END IF
 *
 *     Every right-hand side requires workspace to store NBA local scale
 *     factors. To save workspace, X is computed successively in block columns
 *     of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
 *     workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
      DO K = 1, NBX
 *        Loop over block columns (index = K) of X and, for column-wise scalings,
 *        over individual columns (index = KK).
 *        K1: column index of the first column in X( J, K )
 *        K2: column index of the first column in X( J, K+1 )
 *        so the K2 - K1 is the column count of the block X( J, K )
         K1 = (K-1)*NBRHS + 1
         K2 = MIN( K*NBRHS, NRHS ) + 1
 *
 *        Initialize local scaling factors of current block column X( J, K )
 *
         DO KK = 1, K2 - K1
            DO I = 1, NBA
               WORK( I+KK*LDS ) = ONE
            END DO
         END DO
 *
         IF( NOTRAN ) THEN
 *
 *           Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
 *
            IF( UPPER ) THEN
               JFIRST = NBA
               JLAST = 1
               JINC = -1
            ELSE
               JFIRST = 1
               JLAST = NBA
               JINC = 1
            END IF
         ELSE
 *
 *           Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
 *
            IF( UPPER ) THEN
               JFIRST = 1
               JLAST = NBA
               JINC = 1
            ELSE
               JFIRST = NBA
               JLAST = 1
               JINC = -1
            END IF
         END IF
 *
         DO J = JFIRST, JLAST, JINC
 *           J1: row index of the first row in A( J, J )
 *           J2: row index of the first row in A( J+1, J+1 )
 *           so that J2 - J1 is the row count of the block A( J, J )
            J1 = (J-1)*NB + 1
            J2 = MIN( J*NB, N ) + 1
 *
 *           Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
 *           for all right-hand sides in the current block column,
 *           one RHS at a time.
 *
            DO KK = 1, K2-K1
               RHS = K1 + KK - 1
               IF( KK.EQ.1 ) THEN
                  CALL SLATRS( UPLO, TRANS, DIAG, 'N', J2-J1,
     $                         A( J1, J1 ), LDA, X( J1, RHS ),
     $                         SCALOC, CNORM, INFO )
               ELSE
                  CALL SLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1,
     $                         A( J1, J1 ), LDA, X( J1, RHS ),
     $                         SCALOC, CNORM, INFO )
               END IF
 *              Find largest absolute value entry in the vector segment
 *              X( J1:J2-1, RHS ) as an upper bound for the worst case
 *              growth in the linear updates.
               XNRM( KK ) = SLANGE( 'I', J2-J1, 1, X( J1, RHS ),
     $                              LDX, W )
 *
               IF( SCALOC .EQ. ZERO ) THEN
 *                 LATRS found that A is singular through A(j,j) = 0.
 *                 Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
 *                 and compute A*x = 0 (or A**T*x = 0). Note that
 *                 X(J1:J2-1, KK) is set by LATRS.
                  SCALE( RHS ) = ZERO
                  DO II = 1, J1-1
                     X( II, KK ) = ZERO
                  END DO
                  DO II = J2, N
                     X( II, KK ) = ZERO
                  END DO
 *                 Discard the local scale factors.
                  DO II = 1, NBA
                     WORK( II+KK*LDS ) = ONE
                  END DO
                  SCALOC = ONE
               ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN
 *                 LATRS computed a valid scale factor, but combined with
 *                 the current scaling the solution does not have a
 *                 scale factor > 0.
 *
 *                 Set WORK( J+KK*LDS ) to smallest valid scale
 *                 factor and increase SCALOC accordingly.
                  SCAL = WORK( J+KK*LDS ) / SMLNUM
                  SCALOC = SCALOC * SCAL
                  WORK( J+KK*LDS ) = SMLNUM
 *                 If LATRS overestimated the growth, x may be
 *                 rescaled to preserve a valid combined scale
 *                 factor WORK( J, KK ) > 0.
                  RSCAL = ONE / SCALOC
                  IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN
                     XNRM( KK ) = XNRM( KK ) * RSCAL
                     CALL SSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 )
                     SCALOC = ONE
                  ELSE
 *                    The system op(A) * x = b is badly scaled and its
 *                    solution cannot be represented as (1/scale) * x.
 *                    Set x to zero. This approach deviates from LATRS
 *                    where a completely meaningless non-zero vector
 *                    is returned that is not a solution to op(A) * x = b.
                     SCALE( RHS ) = ZERO
                     DO II = 1, N
                        X( II, KK ) = ZERO
                     END DO
 *                    Discard the local scale factors.
                     DO II = 1, NBA
                        WORK( II+KK*LDS ) = ONE
                     END DO
                     SCALOC = ONE
                  END IF
               END IF
               SCALOC = SCALOC * WORK( J+KK*LDS )
               WORK( J+KK*LDS ) = SCALOC
            END DO
 *
 *           Linear block updates
 *
            IF( NOTRAN ) THEN
               IF( UPPER ) THEN
                  IFIRST = J - 1
                  ILAST = 1
                  IINC = -1
               ELSE
                  IFIRST = J + 1
                  ILAST = NBA
                  IINC = 1
               END IF
            ELSE
               IF( UPPER ) THEN
                  IFIRST = J + 1
                  ILAST = NBA
                  IINC = 1
               ELSE
                  IFIRST = J - 1
                  ILAST = 1
                  IINC = -1
               END IF
            END IF
 *
            DO I = IFIRST, ILAST, IINC
 *              I1: row index of the first column in X( I, K )
 *              I2: row index of the first column in X( I+1, K )
 *              so the I2 - I1 is the row count of the block X( I, K )
               I1 = (I-1)*NB + 1
               I2 = MIN( I*NB, N ) + 1
 *
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
 *              rescale the column segments, if necesssary. Then
 *              the linear update is safely executed.
 *
               DO KK = 1, K2-K1
                  RHS = K1 + KK - 1
 *                 Compute consistent scaling
                  SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) )
 *
 *                 Compute scaling factor to survive the linear update
 *                 simulating consistent scaling.
 *
                  BNRM = SLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W )
                  BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) )
                  XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS ))
                  ANRM = WORK( AWRK + I+(J-1)*NBA )
                  SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM )
 *
 *                 Simultaneously apply the robust update factor and the
 *                 consistency scaling factor to B( I, KK ) and B( J, KK ).
 *
                  SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC
                  IF( SCAL.NE.ONE ) THEN
                     CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
                     WORK( I+KK*LDS ) = SCAMIN*SCALOC
                  END IF
 *
                  SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC
                  IF( SCAL.NE.ONE ) THEN
                     CALL SSCAL( J2-J1, SCAL, X( J1, RHS ), 1 )
                     WORK( J+KK*LDS ) = SCAMIN*SCALOC
                  END IF
               END DO
 *
               IF( NOTRAN ) THEN
 *
 *                 B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
 *
                  CALL SGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE,
     $                        A( I1, J1 ), LDA, X( J1, K1 ), LDX,
     $                        ONE, X( I1, K1 ), LDX )
               ELSE
 *
 *                 B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K )
 *
                  CALL SGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE,
     $                        A( J1, I1 ), LDA, X( J1, K1 ), LDX,
     $                        ONE, X( I1, K1 ), LDX )
               END IF
            END DO
         END DO
 *
 *        Reduce local scaling factors
 *
         DO KK = 1, K2-K1
            RHS = K1 + KK - 1
            DO I = 1, NBA
               SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) )
            END DO
         END DO
 *
 *        Realize consistent scaling
 *
         DO KK = 1, K2-K1
            RHS = K1 + KK - 1
            IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN
               DO I = 1, NBA
                  I1 = (I-1)*NB + 1
                  I2 = MIN( I*NB, N ) + 1
                  SCAL = SCALE( RHS ) / WORK( I+KK*LDS )
                  IF( SCAL.NE.ONE )
     $               CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
               END DO
            END IF
         END DO
      END DO
      RETURN
 *
 *     End of SLATRS3
 *
      END
--- a/lapack-netlib/SRC/strsyl3.f
+++ b/lapack-netlib/SRC/strsyl3.f
--- a/lapack-netlib/SRC/zlatrs3.f
+++ b/lapack-netlib/SRC/zlatrs3.f
@@ -0,0 +1,667 @@
 *> \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow.
 *
 *  Definition:
 *  ===========
 *
 *      SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
 *                          X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          DIAG, NORMIN, TRANS, UPLO
 *       INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
 *       ..
 *       .. Array Arguments ..
 *       DOUBLE PRECISION   CNORM( * ), SCALE( * ), WORK( * )
 *       COMPLEX*16         A( LDA, * ), X( LDX, * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZLATRS3 solves one of the triangular systems
 *>
 *>    A * X = B * diag(scale),  A**T * X = B * diag(scale), or
 *>    A**H * X = B * diag(scale)
 *>
 *> with scaling to prevent overflow.  Here A is an upper or lower
 *> triangular matrix, A**T denotes the transpose of A, A**H denotes the
 *> conjugate transpose of A. X and B are n-by-nrhs matrices and scale
 *> is an nrhs-element vector of scaling factors. A scaling factor scale(j)
 *> is usually less than or equal to 1, chosen such that X(:,j) is less
 *> than the overflow threshold. If the matrix A is singular (A(j,j) = 0
 *> for some j), then a non-trivial solution to A*X = 0 is returned. If
 *> the system is so badly scaled that the solution cannot be represented
 *> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned.
 *>
 *> This is a BLAS-3 version of LATRS for solving several right
 *> hand sides simultaneously.
 *>
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] UPLO
 *> \verbatim
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the matrix A is upper or lower triangular.
 *>          = 'U':  Upper triangular
 *>          = 'L':  Lower triangular
 *> \endverbatim
 *>
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
 *>          Specifies the operation applied to A.
 *>          = 'N':  Solve A * x = s*b  (No transpose)
 *>          = 'T':  Solve A**T* x = s*b  (Transpose)
 *>          = 'C':  Solve A**T* x = s*b  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] DIAG
 *> \verbatim
 *>          DIAG is CHARACTER*1
 *>          Specifies whether or not the matrix A is unit triangular.
 *>          = 'N':  Non-unit triangular
 *>          = 'U':  Unit triangular
 *> \endverbatim
 *>
 *> \param[in] NORMIN
 *> \verbatim
 *>          NORMIN is CHARACTER*1
 *>          Specifies whether CNORM has been set or not.
 *>          = 'Y':  CNORM contains the column norms on entry
 *>          = 'N':  CNORM is not set on entry.  On exit, the norms will
 *>                  be computed and stored in CNORM.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] NRHS
 *> \verbatim
 *>          NRHS is INTEGER
 *>          The number of columns of X.  NRHS >= 0.
 *> \endverbatim
 *>
 *> \param[in] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>          The triangular matrix A.  If UPLO = 'U', the leading n by n
 *>          upper triangular part of the array A contains the upper
 *>          triangular matrix, and the strictly lower triangular part of
 *>          A is not referenced.  If UPLO = 'L', the leading n by n lower
 *>          triangular part of the array A contains the lower triangular
 *>          matrix, and the strictly upper triangular part of A is not
 *>          referenced.  If DIAG = 'U', the diagonal elements of A are
 *>          also not referenced and are assumed to be 1.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max (1,N).
 *> \endverbatim
 *>
 *> \param[in,out] X
 *> \verbatim
 *>          X is COMPLEX*16 array, dimension (LDX,NRHS)
 *>          On entry, the right hand side B of the triangular system.
 *>          On exit, X is overwritten by the solution matrix X.
 *> \endverbatim
 *>
 *> \param[in] LDX
 *> \verbatim
 *>          LDX is INTEGER
 *>          The leading dimension of the array X.  LDX >= max (1,N).
 *> \endverbatim
 *>
 *> \param[out] SCALE
 *> \verbatim
 *>          SCALE is DOUBLE PRECISION array, dimension (NRHS)
 *>          The scaling factor s(k) is for the triangular system
 *>          A * x(:,k) = s(k)*b(:,k)  or  A**T* x(:,k) = s(k)*b(:,k).
 *>          If SCALE = 0, the matrix A is singular or badly scaled.
 *>          If A(j,j) = 0 is encountered, a non-trivial vector x(:,k)
 *>          that is an exact or approximate solution to A*x(:,k) = 0
 *>          is returned. If the system so badly scaled that solution
 *>          cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0
 *>          is returned.
 *> \endverbatim
 *>
 *> \param[in,out] CNORM
 *> \verbatim
 *>          CNORM is DOUBLE PRECISION array, dimension (N)
 *>
 *>          If NORMIN = 'Y', CNORM is an input argument and CNORM(j)
 *>          contains the norm of the off-diagonal part of the j-th column
 *>          of A.  If TRANS = 'N', CNORM(j) must be greater than or equal
 *>          to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j)
 *>          must be greater than or equal to the 1-norm.
 *>
 *>          If NORMIN = 'N', CNORM is an output argument and CNORM(j)
 *>          returns the 1-norm of the offdiagonal part of the j-th column
 *>          of A.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (LWORK).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal size of
 *>          WORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *>          LWORK is INTEGER
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -k, the k-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERauxiliary
 *> \par Further Details:
 *  =====================
 *  \verbatim
 *  The algorithm follows the structure of a block triangular solve.
 *  The diagonal block is solved with a call to the robust the triangular
 *  solver LATRS for every right-hand side RHS = 1, ..., NRHS
 *     op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ),
 *  where op( A ) = A or op( A ) = A**T or op( A ) = A**H.
 *  The linear block updates operate on block columns of X,
 *     B( I, K ) - op(A( I, J )) * X( J, K )
 *  and use GEMM. To avoid overflow in the linear block update, the worst case
 *  growth is estimated. For every RHS, a scale factor s <= 1.0 is computed
 *  such that
 *     || s * B( I, RHS )||_oo
 *   + || op(A( I, J )) ||_oo * || s *  X( J, RHS ) ||_oo <= Overflow threshold
 *
 *  Once all columns of a block column have been rescaled (BLAS-1), the linear
 *  update is executed with GEMM without overflow.
 *
 *  To limit rescaling, local scale factors track the scaling of column segments.
 *  There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA
 *  per right-hand side column RHS = 1, ..., NRHS. The global scale factor
 *  SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS )
 *  I = 1, ..., NBA.
 *  A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS )
 *  updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The
 *  linear update of potentially inconsistently scaled vector segments
 *     s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) )
 *  computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and,
 *  if necessary, rescales the blocks prior to calling GEMM.
 *
 *  \endverbatim
 *  =====================================================================
 *  References:
 *  C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019).
 *  Parallel robust solution of triangular linear systems. Concurrency
 *  and Computation: Practice and Experience, 31(19), e5064.
 *
 *  Contributor:
 *   Angelika Schwarz, Umea University, Sweden.
 *
 *  =====================================================================
      SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA,
     $                    X, LDX, SCALE, CNORM, WORK, LWORK, INFO )
      IMPLICIT NONE
 *
 *     .. Scalar Arguments ..
      CHARACTER          DIAG, TRANS, NORMIN, UPLO
      INTEGER            INFO, LDA, LWORK, LDX, N, NRHS
 *     ..
 *     .. Array Arguments ..
      COMPLEX*16         A( LDA, * ), X( LDX, * )
      DOUBLE PRECISION   CNORM( * ), SCALE( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ZERO, ONE
      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
      COMPLEX*16         CZERO, CONE
      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ) )
      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ) )
      INTEGER            NBMAX, NBMIN, NBRHS, NRHSMIN
      PARAMETER          ( NRHSMIN = 2, NBRHS = 32 )
      PARAMETER          ( NBMIN = 8, NBMAX = 64 )
 *     ..
 *     .. Local Arrays ..
      DOUBLE PRECISION   W( NBMAX ), XNRM( NBRHS )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
      INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
     $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
      DOUBLE PRECISION   ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
     $                   SCAMIN, SMLNUM, TMAX
 *     ..
 *     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV
      DOUBLE PRECISION   DLAMCH, ZLANGE, DLARMM
      EXTERNAL           ILAENV, LSAME, DLAMCH, ZLANGE, DLARMM
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           ZLATRS, ZDSCAL, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          ABS, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
      INFO = 0
      UPPER = LSAME( UPLO, 'U' )
      NOTRAN = LSAME( TRANS, 'N' )
      NOUNIT = LSAME( DIAG, 'N' )
      LQUERY = ( LWORK.EQ.-1 )
 *
 *     Partition A and X into blocks.
 *
      NB = MAX( NBMIN, ILAENV( 1, 'ZLATRS', '', N, N, -1, -1 ) )
      NB = MIN( NBMAX, NB )
      NBA = MAX( 1, (N + NB - 1) / NB )
      NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS )
 *
 *     Compute the workspace
 *
 *     The workspace comprises two parts.
 *     The first part stores the local scale factors. Each simultaneously
 *     computed right-hand side requires one local scale factor per block
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
      LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
      LDS = NBA
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
      LANRM = NBA * NBA
      AWRK = LSCALE
      WORK( 1 ) = LSCALE + LANRM
 *
 *     Test the input parameters.
 *
      IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
         INFO = -1
      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
     $         LSAME( TRANS, 'C' ) ) THEN
         INFO = -2
      ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN
         INFO = -3
      ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT.
     $         LSAME( NORMIN, 'N' ) ) THEN
         INFO = -4
      ELSE IF( N.LT.0 ) THEN
         INFO = -5
      ELSE IF( NRHS.LT.0 ) THEN
         INFO = -6
      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
         INFO = -8
      ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
         INFO = -10
      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
         INFO = -14
      END IF
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLATRS3', -INFO )
         RETURN
      ELSE IF( LQUERY ) THEN
         RETURN
      END IF
 *
 *     Initialize scaling factors
 *
      DO KK = 1, NRHS
         SCALE( KK ) = ONE
      END DO
 *
 *     Quick return if possible
 *
      IF( MIN( N, NRHS ).EQ.0 )
     $   RETURN
 *
 *     Determine machine dependent constant to control overflow.
 *
      BIGNUM = DLAMCH( 'Overflow' )
      SMLNUM = DLAMCH( 'Safe Minimum' )
 *
 *     Use unblocked code for small problems
 *
      IF( NRHS.LT.NRHSMIN ) THEN
         CALL ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1),
     $                SCALE( 1 ), CNORM, INFO )
         DO K = 2, NRHS
            CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ),
     $                   SCALE( K ), CNORM, INFO )
         END DO
         RETURN
      END IF
 *
 *     Compute norms of blocks of A excluding diagonal blocks and find
 *     the block with the largest norm TMAX.
 *
      TMAX = ZERO
      DO J = 1, NBA
         J1 = (J-1)*NB + 1
         J2 = MIN( J*NB, N ) + 1
         IF ( UPPER ) THEN
            IFIRST = 1
            ILAST = J - 1
         ELSE
            IFIRST = J + 1
            ILAST = NBA
         END IF
         DO I = IFIRST, ILAST
            I1 = (I-1)*NB + 1
            I2 = MIN( I*NB, N ) + 1
 *
 *           Compute upper bound of A( I1:I2-1, J1:J2-1 ).
 *
            IF( NOTRAN ) THEN
               ANRM = ZLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
               WORK( AWRK + I+(J-1)*NBA ) = ANRM
            ELSE
               ANRM = ZLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W )
               WORK( AWRK + J+(I-1) * NBA ) = ANRM
            END IF
            TMAX = MAX( TMAX, ANRM )
         END DO
      END DO
 *
      IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN
 *
 *        Some matrix entries have huge absolute value. At least one upper
 *        bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point
 *        number, either due to overflow in LANGE or due to Inf in A.
 *        Fall back to LATRS. Set normin = 'N' for every right-hand side to
 *        force computation of TSCAL in LATRS to avoid the likely overflow
 *        in the computation of the column norms CNORM.
 *
         DO K = 1, NRHS
            CALL ZLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ),
     $                   SCALE( K ), CNORM, INFO )
         END DO
         RETURN
      END IF
 *
 *     Every right-hand side requires workspace to store NBA local scale
 *     factors. To save workspace, X is computed successively in block columns
 *     of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient
 *     workspace is available, larger values of NBRHS or NBRHS = NRHS are viable.
      DO K = 1, NBX
 *        Loop over block columns (index = K) of X and, for column-wise scalings,
 *        over individual columns (index = KK).
 *        K1: column index of the first column in X( J, K )
 *        K2: column index of the first column in X( J, K+1 )
 *        so the K2 - K1 is the column count of the block X( J, K )
         K1 = (K-1)*NBRHS + 1
         K2 = MIN( K*NBRHS, NRHS ) + 1
 *
 *        Initialize local scaling factors of current block column X( J, K )
 *
         DO KK = 1, K2 - K1
            DO I = 1, NBA
               WORK( I+KK*LDS ) = ONE
            END DO
         END DO
 *
         IF( NOTRAN ) THEN
 *
 *           Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
 *
            IF( UPPER ) THEN
               JFIRST = NBA
               JLAST = 1
               JINC = -1
            ELSE
               JFIRST = 1
               JLAST = NBA
               JINC = 1
            END IF
         ELSE
 *
 *           Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1))
 *           where op(A) = A**T or op(A) = A**H
 *
            IF( UPPER ) THEN
               JFIRST = 1
               JLAST = NBA
               JINC = 1
            ELSE
               JFIRST = NBA
               JLAST = 1
               JINC = -1
            END IF
         END IF

         DO J = JFIRST, JLAST, JINC
 *           J1: row index of the first row in A( J, J )
 *           J2: row index of the first row in A( J+1, J+1 )
 *           so that J2 - J1 is the row count of the block A( J, J )
            J1 = (J-1)*NB + 1
            J2 = MIN( J*NB, N ) + 1
 *
 *           Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS )
 *
            DO KK = 1, K2 - K1
               RHS = K1 + KK - 1
               IF( KK.EQ.1 ) THEN
                  CALL ZLATRS( UPLO, TRANS, DIAG, 'N', J2-J1,
     $                         A( J1, J1 ), LDA, X( J1, RHS ),
     $                         SCALOC, CNORM, INFO )
               ELSE
                  CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1,
     $                         A( J1, J1 ), LDA, X( J1, RHS ),
     $                         SCALOC, CNORM, INFO )
               END IF
 *              Find largest absolute value entry in the vector segment
 *              X( J1:J2-1, RHS ) as an upper bound for the worst case
 *              growth in the linear updates.
               XNRM( KK ) = ZLANGE( 'I', J2-J1, 1, X( J1, RHS ),
     $                              LDX, W )
 *
               IF( SCALOC .EQ. ZERO ) THEN
 *                 LATRS found that A is singular through A(j,j) = 0.
 *                 Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0
 *                 and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is
 *                 set by LATRS.
                  SCALE( RHS ) = ZERO
                  DO II = 1, J1-1
                     X( II, KK ) = CZERO
                  END DO
                  DO II = J2, N
                     X( II, KK ) = CZERO
                  END DO
 *                 Discard the local scale factors.
                  DO II = 1, NBA
                     WORK( II+KK*LDS ) = ONE
                  END DO
                  SCALOC = ONE
               ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN
 *                 LATRS computed a valid scale factor, but combined with
 *                 the current scaling the solution does not have a
 *                 scale factor > 0.
 *
 *                 Set WORK( J+KK*LDS ) to smallest valid scale
 *                 factor and increase SCALOC accordingly.
                  SCAL = WORK( J+KK*LDS ) / SMLNUM
                  SCALOC = SCALOC * SCAL
                  WORK( J+KK*LDS ) = SMLNUM
 *                 If LATRS overestimated the growth, x may be
 *                 rescaled to preserve a valid combined scale
 *                 factor WORK( J, KK ) > 0.
                  RSCAL = ONE / SCALOC
                  IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN
                     XNRM( KK ) = XNRM( KK ) * RSCAL
                     CALL ZDSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 )
                     SCALOC = ONE
                  ELSE
 *                    The system op(A) * x = b is badly scaled and its
 *                    solution cannot be represented as (1/scale) * x.
 *                    Set x to zero. This approach deviates from LATRS
 *                    where a completely meaningless non-zero vector
 *                    is returned that is not a solution to op(A) * x = b.
                     SCALE( RHS ) = ZERO
                     DO II = 1, N
                        X( II, KK ) = CZERO
                     END DO
 *                    Discard the local scale factors.
                     DO II = 1, NBA
                        WORK( II+KK*LDS ) = ONE
                     END DO
                     SCALOC = ONE
                  END IF
               END IF
               SCALOC = SCALOC * WORK( J+KK*LDS )
               WORK( J+KK*LDS ) = SCALOC
            END DO
 *
 *           Linear block updates
 *
            IF( NOTRAN ) THEN
               IF( UPPER ) THEN
                  IFIRST = J - 1
                  ILAST = 1
                  IINC = -1
               ELSE
                  IFIRST = J + 1
                  ILAST = NBA
                  IINC = 1
               END IF
            ELSE
               IF( UPPER ) THEN
                  IFIRST = J + 1
                  ILAST = NBA
                  IINC = 1
               ELSE
                  IFIRST = J - 1
                  ILAST = 1
                  IINC = -1
               END IF
            END IF
 *
            DO I = IFIRST, ILAST, IINC
 *              I1: row index of the first column in X( I, K )
 *              I2: row index of the first column in X( I+1, K )
 *              so the I2 - I1 is the row count of the block X( I, K )
               I1 = (I-1)*NB + 1
               I2 = MIN( I*NB, N ) + 1
 *
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
 *              rescale the column segments, if necesssary. Then
 *              the linear update is safely executed.
 *
               DO KK = 1, K2 - K1
                  RHS = K1 + KK - 1
 *                 Compute consistent scaling
                  SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) )
 *
 *                 Compute scaling factor to survive the linear update
 *                 simulating consistent scaling.
 *
                  BNRM = ZLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W )
                  BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) )
                  XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) )
                  ANRM = WORK( AWRK + I+(J-1)*NBA )
                  SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM )
 *
 *                 Simultaneously apply the robust update factor and the
 *                 consistency scaling factor to X( I, KK ) and X( J, KK ).
 *
                  SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC
                  IF( SCAL.NE.ONE ) THEN
                     CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
                     WORK( I+KK*LDS ) = SCAMIN*SCALOC
                  END IF
 *
                  SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC
                  IF( SCAL.NE.ONE ) THEN
                     CALL ZDSCAL( J2-J1, SCAL, X( J1, RHS ), 1 )
                     WORK( J+KK*LDS ) = SCAMIN*SCALOC
                  END IF
               END DO
 *
               IF( NOTRAN ) THEN
 *
 *                 B( I, K ) := B( I, K ) - A( I, J ) * X( J, K )
 *
                  CALL ZGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE,
     $                        A( I1, J1 ), LDA, X( J1, K1 ), LDX,
     $                        CONE, X( I1, K1 ), LDX )
               ELSE IF( LSAME( TRANS, 'T' ) ) THEN
 *
 *                 B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K )
 *
                  CALL ZGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE,
     $                        A( J1, I1 ), LDA, X( J1, K1 ), LDX,
     $                        CONE, X( I1, K1 ), LDX )
               ELSE
 *
 *                 B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K )
 *
                  CALL ZGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE,
     $                        A( J1, I1 ), LDA, X( J1, K1 ), LDX,
     $                        CONE, X( I1, K1 ), LDX )
               END IF
            END DO
         END DO

 *
 *        Reduce local scaling factors
 *
         DO KK = 1, K2 - K1
            RHS = K1 + KK - 1
            DO I = 1, NBA
               SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) )
            END DO
         END DO
 *
 *        Realize consistent scaling
 *
         DO KK = 1, K2 - K1
            RHS = K1 + KK - 1
            IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN
               DO I = 1, NBA
                  I1 = (I - 1) * NB + 1
                  I2 = MIN( I * NB, N ) + 1
                  SCAL = SCALE( RHS ) / WORK( I+KK*LDS )
                  IF( SCAL.NE.ONE )
     $               CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 )
               END DO
            END IF
         END DO
      END DO
      RETURN
 *
 *     End of ZLATRS3
 *
      END
--- a/lapack-netlib/SRC/ztrsyl3.f
+++ b/lapack-netlib/SRC/ztrsyl3.f